diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /net/core | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/core')
57 files changed, 76005 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile new file mode 100644 index 000000000..10edd66a8 --- /dev/null +++ b/net/core/Makefile @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux networking core. +# + +obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ + flow_dissector.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_core.o + +obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ + neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ + fib_notifier.o xdp.o flow_offload.o gro.o + +obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o + +obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o +obj-$(CONFIG_PROC_FS) += net-procfs.o +obj-$(CONFIG_NET_PKTGEN) += pktgen.o +obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_FIB_RULES) += fib_rules.o +obj-$(CONFIG_TRACEPOINTS) += net-traces.o +obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_SELFTESTS) += selftests.o +obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o +obj-$(CONFIG_DST_CACHE) += dst_cache.o +obj-$(CONFIG_HWBM) += hwbm.o +obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_FAILOVER) += failover.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_OF) += of_net.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c new file mode 100644 index 000000000..ad01b1bea --- /dev/null +++ b/net/core/bpf_sk_storage.c @@ -0,0 +1,965 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_local_storage.h> +#include <net/bpf_sk_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> + +DEFINE_BPF_STORAGE_CACHE(sk_cache); + +static struct bpf_local_storage_data * +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_map *smap; + + sk_storage = + rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); + if (!sk_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); +} + +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = bpf_sk_storage_lookup(sk, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata), true); + + return 0; +} + +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ +void bpf_sk_storage_free(struct sock *sk) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *sk_storage; + bool free_sk_storage = false; + struct hlist_node *n; + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage) { + rcu_read_unlock(); + return; + } + + /* Netiher the bpf_prog nor the bpf-map's syscall + * could be modifying the sk_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * sk_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the sk_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&sk_storage->lock); + hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { + /* Always unlink from map before unlinking from + * sk_storage. + */ + bpf_selem_unlink_map(selem); + free_sk_storage = bpf_selem_unlink_storage_nolock( + sk_storage, selem, true, false); + } + raw_spin_unlock_bh(&sk_storage->lock); + rcu_read_unlock(); + + if (free_sk_storage) + kfree_rcu(sk_storage, rcu); +} + +static void bpf_sk_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); + bpf_local_storage_map_free(smap, NULL); +} + +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); + return &smap->map; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = bpf_sk_storage_lookup(sock->sk, map, true); + sockfd_put(sock); + return sdata ? sdata->data : NULL; + } + + return ERR_PTR(err); +} + +static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = bpf_local_storage_update( + sock->sk, (struct bpf_local_storage_map *)map, value, + map_flags, GFP_ATOMIC); + sockfd_put(sock); + return PTR_ERR_OR_ZERO(sdata); + } + + return err; +} + +static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + err = bpf_sk_storage_del(sock->sk, map); + sockfd_put(sock); + return err; + } + + return err; +} + +static struct bpf_local_storage_elem * +bpf_sk_storage_clone_elem(struct sock *newsk, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_elem *copy_selem; + + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); + if (!copy_selem) + return NULL; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data, true); + else + copy_map_value(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data); + + return copy_selem; +} + +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) +{ + struct bpf_local_storage *new_sk_storage = NULL; + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + int ret = 0; + + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + + if (!sk_storage || hlist_empty(&sk_storage->list)) + goto out; + + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + struct bpf_local_storage_elem *copy_selem; + struct bpf_local_storage_map *smap; + struct bpf_map *map; + + smap = rcu_dereference(SDATA(selem)->smap); + if (!(smap->map.map_flags & BPF_F_CLONE)) + continue; + + /* Note that for lockless listeners adding new element + * here can race with cleanup in bpf_local_storage_map_free. + * Try to grab map refcnt to make sure that it's still + * alive and prevent concurrent removal. + */ + map = bpf_map_inc_not_zero(&smap->map); + if (IS_ERR(map)) + continue; + + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); + if (!copy_selem) { + ret = -ENOMEM; + bpf_map_put(map); + goto out; + } + + if (new_sk_storage) { + bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); + } else { + ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); + if (ret) { + kfree(copy_selem); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } + + new_sk_storage = + rcu_dereference(copy_selem->local_storage); + } + bpf_map_put(map); + } + +out: + rcu_read_unlock(); + + /* In case of an error, don't free anything explicitly here, the + * caller is responsible to call bpf_sk_storage_free. + */ + + return ret; +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + struct bpf_local_storage_data *sdata; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) + return (unsigned long)NULL; + + sdata = bpf_sk_storage_lookup(sk, map, true); + if (sdata) + return (unsigned long)sdata->data; + + if (flags == BPF_SK_STORAGE_GET_F_CREATE && + /* Cannot add new elem to a going away sk. + * Otherwise, the new elem may become a leak + * (and also other memory issues during map + * destruction). + */ + refcount_inc_not_zero(&sk->sk_refcnt)) { + sdata = bpf_local_storage_update( + sk, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST, gfp_flags); + /* sk must be a fullsock (guaranteed by verifier), + * so sock_gen_put() is unnecessary. + */ + sock_put(sk); + return IS_ERR(sdata) ? + (unsigned long)NULL : (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) +{ + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!sk || !sk_fullsock(sk)) + return -EINVAL; + + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + int err; + + err = bpf_sk_storage_del(sk, map); + sock_put(sk); + return err; + } + + return -ENOENT; +} + +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + int optmem_max = READ_ONCE(sysctl_optmem_max); + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; +} + +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = owner; + + atomic_sub(size, &sk->sk_omem_alloc); +} + +static struct bpf_local_storage __rcu ** +bpf_sk_storage_ptr(void *owner) +{ + struct sock *sk = owner; + + return &sk->sk_bpf_storage; +} + +BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) +const struct bpf_map_ops sk_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, + .map_update_elem = bpf_fd_sk_storage_update_elem, + .map_delete_elem = bpf_fd_sk_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_id = &sk_storage_map_btf_ids[0], + .map_local_storage_charge = bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, +}; + +const struct bpf_func_proto bpf_sk_storage_get_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. + */ + switch (prog->expected_attach_type) { + case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + if (IS_ERR_OR_NULL(btf_vmlinux)) + return false; + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (in_hardirq() || in_nmi()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, + gfp_flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (in_hardirq() || in_nmi()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... + */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. + */ + if (!bpf_capable()) + return ERR_PTR(-EPERM); + + nla_for_each_nested(nla, nla_stgs, rem) { + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; + } + } + + diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nla, nla_stgs, rem) { + struct bpf_map *map; + int map_fd; + + if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + continue; + + map_fd = nla_get_u32(nla); + map = bpf_map_get(map_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_local_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = bpf_local_storage_lookup(sk_storage, + (struct bpf_local_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); + +struct bpf_iter_seq_sk_storage_map_info { + struct bpf_map *map; + unsigned int bucket_id; + unsigned skip_elems; +}; + +static struct bpf_local_storage_elem * +bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, + struct bpf_local_storage_elem *prev_selem) + __acquires(RCU) __releases(RCU) +{ + struct bpf_local_storage *sk_storage; + struct bpf_local_storage_elem *selem; + u32 skip_elems = info->skip_elems; + struct bpf_local_storage_map *smap; + u32 bucket_id = info->bucket_id; + u32 i, count, n_buckets; + struct bpf_local_storage_map_bucket *b; + + smap = (struct bpf_local_storage_map *)info->map; + n_buckets = 1U << smap->bucket_log; + if (bucket_id >= n_buckets) + return NULL; + + /* try to find next selem in the same bucket */ + selem = prev_selem; + count = 0; + while (selem) { + selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), + struct bpf_local_storage_elem, map_node); + if (!selem) { + /* not found, unlock and go to the next bucket */ + b = &smap->buckets[bucket_id++]; + rcu_read_unlock(); + skip_elems = 0; + break; + } + sk_storage = rcu_dereference(selem->local_storage); + if (sk_storage) { + info->skip_elems = skip_elems + count; + return selem; + } + count++; + } + + for (i = bucket_id; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + rcu_read_lock(); + count = 0; + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + sk_storage = rcu_dereference(selem->local_storage); + if (sk_storage && count >= skip_elems) { + info->bucket_id = i; + info->skip_elems = count; + return selem; + } + count++; + } + rcu_read_unlock(); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_local_storage_elem *selem; + + selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); + if (!selem) + return NULL; + + if (*pos == 0) + ++*pos; + return selem; +} + +static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_sk_storage_map_seq_find_next(seq->private, v); +} + +struct bpf_iter__bpf_sk_storage_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(struct sock *, sk); + __bpf_md_ptr(void *, value); +}; + +DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, + struct bpf_map *map, struct sock *sk, + void *value) + +static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, + struct bpf_local_storage_elem *selem) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_iter__bpf_sk_storage_map ctx = {}; + struct bpf_local_storage *sk_storage; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, selem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (selem) { + sk_storage = rcu_dereference(selem->local_storage); + ctx.sk = sk_storage->owner; + ctx.value = SDATA(selem)->data; + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_sk_storage_map_seq_show(seq, v); +} + +static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + if (!v) + (void)__bpf_sk_storage_map_seq_show(seq, v); + else + rcu_read_unlock(); +} + +static int bpf_iter_init_sk_storage_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + bpf_map_inc_with_uref(aux->map); + seq_info->map = aux->map; + return 0; +} + +static void bpf_iter_fini_sk_storage_map(void *priv_data) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + bpf_map_put_with_uref(seq_info->map); +} + +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto put_map; + + if (prog->aux->max_rdwr_access > map->value_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static const struct seq_operations bpf_sk_storage_map_seq_ops = { + .start = bpf_sk_storage_map_seq_start, + .next = bpf_sk_storage_map_seq_next, + .stop = bpf_sk_storage_map_seq_stop, + .show = bpf_sk_storage_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_sk_storage_map_seq_ops, + .init_seq_private = bpf_iter_init_sk_storage_map, + .fini_seq_private = bpf_iter_fini_sk_storage_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), +}; + +static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { + .target = "bpf_sk_storage_map", + .attach_target = bpf_iter_attach_map, + .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__bpf_sk_storage_map, value), + PTR_TO_BUF | PTR_MAYBE_NULL }, + }, + .seq_info = &iter_seq_info, +}; + +static int __init bpf_sk_storage_map_iter_init(void) +{ + bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); +} +late_initcall(bpf_sk_storage_map_iter_init); diff --git a/net/core/datagram.c b/net/core/datagram.c new file mode 100644 index 000000000..8dabb9a74 --- /dev/null +++ b/net/core/datagram.c @@ -0,0 +1,842 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all + * protocols. Possibly a generic IP version on top of these would + * make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and + * NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. The poll was + * shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old + * udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() + * understood + * Alan Cox : Rewrote skb_read_datagram to avoid the + * skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. + * IPX can no longer use the SO_TYPE hack + * but AX.25 now works right, and SPX is + * feasible. + * Alan Cox : Fixed write poll of non IP protocol + * crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * Pete Wyckoff : Unconnected accept() fix. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/poll.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/indirect_call_wrapper.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> + +#include <net/checksum.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <trace/events/skb.h> +#include <net/busy_poll.h> + +/* + * Is a socket 'connection oriented' ? + */ +static inline int connection_based(struct sock *sk) +{ + return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; +} + +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, + void *key) +{ + /* + * Avoid a wakeup if event not interesting for us + */ + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} +/* + * Wait for the last received packet to be different from skb + */ +int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, + int *err, long *timeo_p, + const struct sk_buff *skb) +{ + int error; + DEFINE_WAIT_FUNC(wait, receiver_wake_function); + + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out_err; + + if (READ_ONCE(queue->prev) != skb) + goto out; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out_noerr; + + /* Sequenced packets can come disconnected. + * If so we report the problem + */ + error = -ENOTCONN; + if (connection_based(sk) && + !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) + goto out_err; + + /* handle signals */ + if (signal_pending(current)) + goto interrupted; + + error = 0; + *timeo_p = schedule_timeout(*timeo_p); +out: + finish_wait(sk_sleep(sk), &wait); + return error; +interrupted: + error = sock_intr_errno(*timeo_p); +out_err: + *err = error; + goto out; +out_noerr: + *err = 0; + error = 1; + goto out; +} +EXPORT_SYMBOL(__skb_wait_for_more_packets); + +static struct sk_buff *skb_set_peeked(struct sk_buff *skb) +{ + struct sk_buff *nskb; + + if (skb->peeked) + return skb; + + /* We have to unshare an skb before modifying it. */ + if (!skb_shared(skb)) + goto done; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return ERR_PTR(-ENOMEM); + + skb->prev->next = nskb; + skb->next->prev = nskb; + nskb->prev = skb->prev; + nskb->next = skb->next; + + consume_skb(skb); + skb = nskb; + +done: + skb->peeked = 1; + + return skb; +} + +struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, + int *off, int *err, + struct sk_buff **last) +{ + bool peek_at_off = false; + struct sk_buff *skb; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } + + *last = queue->prev; + skb_queue_walk(queue, skb) { + if (flags & MSG_PEEK) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { + _off -= skb->len; + continue; + } + if (!skb->len) { + skb = skb_set_peeked(skb); + if (IS_ERR(skb)) { + *err = PTR_ERR(skb); + return NULL; + } + } + refcount_inc(&skb->users); + } else { + __skb_unlink(skb, queue); + } + *off = _off; + return skb; + } + return NULL; +} + +/** + * __skb_try_recv_datagram - Receive a datagram skbuff + * @sk: socket + * @queue: socket queue from which to receive + * @flags: MSG\_ flags + * @off: an offset in bytes to peek skb from. Returns an offset + * within an skb where data actually starts + * @err: error code returned + * @last: set to last peeked message to inform the wait function + * what to look for when peeking + * + * Get a datagram skbuff, understands the peeking, nonblocking wakeups + * and possible races. This replaces identical code in packet, raw and + * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes + * the long standing peek and read race for datagram sockets. If you + * alter this routine remember it must be re-entrant. + * + * This function will lock the socket if a skb is returned, so + * the caller needs to unlock the socket in that case (usually by + * calling skb_free_datagram). Returns NULL with @err set to + * -EAGAIN if no data was available or to some other value if an + * error was detected. + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, int *off, int *err, + struct sk_buff **last) +{ + struct sk_buff *skb; + unsigned long cpu_flags; + /* + * Caller is allowed not to check sk->sk_err before skb_recv_datagram() + */ + int error = sock_error(sk); + + if (error) + goto no_packet; + + do { + /* Again only user level code calls this function, so nothing + * interrupt level will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... + * However, this function was correct in any case. 8) + */ + spin_lock_irqsave(&queue->lock, cpu_flags); + skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + last); + spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (error) + goto no_packet; + if (skb) + return skb; + + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (READ_ONCE(queue->prev) != *last); + + error = -EAGAIN; + +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_try_recv_datagram); + +struct sk_buff *__skb_recv_datagram(struct sock *sk, + struct sk_buff_head *sk_queue, + unsigned int flags, int *off, int *err) +{ + struct sk_buff *skb, *last; + long timeo; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, + &last); + if (skb) + return skb; + + if (*err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, sk_queue, err, + &timeo, last)); + + return NULL; +} +EXPORT_SYMBOL(__skb_recv_datagram); + +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, + int *err) +{ + int off = 0; + + return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, + &off, err); +} +EXPORT_SYMBOL(skb_recv_datagram); + +void skb_free_datagram(struct sock *sk, struct sk_buff *skb) +{ + consume_skb(skb); +} +EXPORT_SYMBOL(skb_free_datagram); + +void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) +{ + bool slow; + + if (!skb_unref(skb)) { + sk_peek_offset_bwd(sk, len); + return; + } + + slow = lock_sock_fast(sk); + sk_peek_offset_bwd(sk, len); + skb_orphan(skb); + unlock_sock_fast(sk, slow); + + /* skb is now orphaned, can be freed outside of locked section */ + __kfree_skb(skb); +} +EXPORT_SYMBOL(__skb_free_datagram_locked); + +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, + struct sk_buff *skb, unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb)) +{ + int err = 0; + + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk_queue->lock); + if (skb->next) { + __skb_unlink(skb, sk_queue); + refcount_dec(&skb->users); + if (destructor) + destructor(sk, skb); + err = 0; + } + spin_unlock_bh(&sk_queue->lock); + } + + atomic_inc(&sk->sk_drops); + return err; +} +EXPORT_SYMBOL(__sk_queue_drop_skb); + +/** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG\_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + * + * It returns 0 if the packet was removed by us. + */ + +int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, + NULL); + + kfree_skb(skb); + return err; +} +EXPORT_SYMBOL(skb_kill_datagram); + +INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, + size_t bytes, + void *data __always_unused, + struct iov_iter *i)); + +static int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, + struct iov_iter *), void *data) +{ + int start = skb_headlen(skb); + int i, copy = start - offset, start_off = offset, n; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + skb->data + offset, copy, data, to); + offset += n; + if (n != copy) + goto short_copy; + if ((len -= copy) == 0) + return 0; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); + + if (copy > len) + copy = len; + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + skb_frag_off(frag) + offset - start, + copy, data, to); + kunmap(page); + offset += n; + if (n != copy) + goto short_copy; + if (!(len -= copy)) + return 0; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (__skb_datagram_iter(frag_iter, offset - start, + to, copy, fault_short, cb, data)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + + /* This is not really a user copy fault, but rather someone + * gave us a bogus length on the skb. We should probably + * print a warning here as it may indicate a kernel bug. + */ + +fault: + iov_iter_revert(to, offset - start_off); + return -EFAULT; + +short_copy: + if (fault_short || iov_iter_count(to)) + goto fault; + + return 0; +} + +/** + * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator + * and update a hash. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @hash: hash request to update + */ +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash) +{ + return __skb_datagram_iter(skb, offset, to, len, true, + hash_and_copy_to_iter, hash); +} +EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); + +static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + trace_skb_copy_datagram_iovec(skb, len); + return __skb_datagram_iter(skb, offset, to, len, false, + simple_copy_to_iter, NULL); +} +EXPORT_SYMBOL(skb_copy_datagram_iter); + +/** + * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying to + * @from: the copy source + * @len: amount of data to copy to buffer from iovec + * + * Returns 0 or -EFAULT. + */ +int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (copy_from_iter(skb->data + offset, copy, from) != copy) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + size_t copied; + + if (copy > len) + copy = len; + copied = copy_page_from_iter(skb_frag_page(frag), + skb_frag_off(frag) + offset - start, + copy, from); + if (copied != copy) + goto fault; + + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_from_iter(frag_iter, + offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter); + +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length) +{ + int frag; + + if (msg && msg->msg_ubuf && msg->sg_from_iter) + return msg->sg_from_iter(sk, skb, from, length); + + frag = skb_shinfo(skb)->nr_frags; + + while (length && iov_iter_count(from)) { + struct page *pages[MAX_SKB_FRAGS]; + struct page *last_head = NULL; + size_t start; + ssize_t copied; + unsigned long truesize; + int refs, n = 0; + + if (frag == MAX_SKB_FRAGS) + return -EMSGSIZE; + + copied = iov_iter_get_pages2(from, pages, length, + MAX_SKB_FRAGS - frag, &start); + if (copied < 0) + return -EFAULT; + + length -= copied; + + truesize = PAGE_ALIGN(copied + start); + skb->data_len += copied; + skb->len += copied; + skb->truesize += truesize; + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + for (refs = 0; copied != 0; start = 0) { + int size = min_t(int, copied, PAGE_SIZE - start); + struct page *head = compound_head(pages[n]); + + start += (pages[n] - head) << PAGE_SHIFT; + copied -= size; + n++; + if (frag) { + skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; + + if (head == skb_frag_page(last) && + start == skb_frag_off(last) + skb_frag_size(last)) { + skb_frag_size_add(last, size); + /* We combined this page, we need to release + * a reference. Since compound pages refcount + * is shared among many pages, batch the refcount + * adjustments to limit false sharing. + */ + last_head = head; + refs++; + continue; + } + } + if (refs) { + page_ref_sub(last_head, refs); + refs = 0; + } + skb_fill_page_desc_noacc(skb, frag++, head, start, size); + } + if (refs) + page_ref_sub(last_head, refs); + } + return 0; +} +EXPORT_SYMBOL(__zerocopy_sg_from_iter); + +/** + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter + * @skb: buffer to copy + * @from: the source to copy from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + */ +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; + + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); +} +EXPORT_SYMBOL(zerocopy_sg_from_iter); + +/** + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator + * and update a checksum. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @csump: checksum pointer + */ +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + __wsum *csump) +{ + struct csum_state csdata = { .csum = *csump }; + int ret; + + ret = __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, &csdata); + if (ret) + return ret; + + *csump = csdata.csum; + return 0; +} + +/** + * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. + * @skb: skbuff + * @hlen: hardware length + * @msg: destination + * + * Caller _must_ check that skb will fit to this iovec. + * + * Returns: 0 - success. + * -EINVAL - checksum failure. + * -EFAULT - fault during copy. + */ +int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, + int hlen, struct msghdr *msg) +{ + __wsum csum; + int chunk = skb->len - hlen; + + if (!chunk) + return 0; + + if (msg_data_left(msg) < chunk) { + if (__skb_checksum_complete(skb)) + return -EINVAL; + if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) + goto fault; + } else { + csum = csum_partial(skb->data, hlen, skb->csum); + if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, + chunk, &csum)) + goto fault; + + if (csum_fold(csum)) { + iov_iter_revert(&msg->msg_iter, chunk); + return -EINVAL; + } + + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(NULL, skb); + } + return 0; +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you *don't* use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ +__poll_t datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + __poll_t mask; + u8 shutdown; + + sock_poll_wait(file, sock, wait); + mask = 0; + + /* exceptional events? */ + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) + mask |= EPOLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); + + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; + if (shutdown == SHUTDOWN_MASK) + mask |= EPOLLHUP; + + /* readable? */ + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + mask |= EPOLLIN | EPOLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + int state = READ_ONCE(sk->sk_state); + + if (state == TCP_CLOSE) + mask |= EPOLLHUP; + /* connection hasn't started yet? */ + if (state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; + else + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + return mask; +} +EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c new file mode 100644 index 000000000..1ba3662fa --- /dev/null +++ b/net/core/dev.c @@ -0,0 +1,11486 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * NET3 Protocol independent device support routines. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Florian la Roche <rzsfl@rz.uni-sb.de> + * Alan Cox <gw4pts@gw4pts.ampr.org> + * David Hinds <dahinds@users.sourceforge.net> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Adam Sulmicki <adam@cfar.umd.edu> + * Pekka Riikonen <priikone@poesidon.pspt.fi> + * + * Changes: + * D.J. Barrow : Fixed bug where dev->refcnt gets set + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant + * stunts to keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into + * drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before + * calling netif_rx. Saves a function + * call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close + * changes. + * Rudi Cilibrasi : Pass the right thing to + * set_mac_address() + * Dave Miller : 32bit quantity for the device lock to + * make it work out on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russell : SIOCSIFNAME + * Pekka Riikonen : Netdev boot-time settings code + * Andrew Morton : Make unregister_netdevice wait + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling + * - netif_rx() feedback + */ + +#include <linux/uaccess.h> +#include <linux/bitops.h> +#include <linux/capability.h> +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/skbuff.h> +#include <linux/kthread.h> +#include <linux/bpf.h> +#include <linux/bpf_trace.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/busy_poll.h> +#include <linux/rtnetlink.h> +#include <linux/stat.h> +#include <net/dsa.h> +#include <net/dst.h> +#include <net/dst_metadata.h> +#include <net/gro.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> +#include <net/checksum.h> +#include <net/xfrm.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netpoll.h> +#include <linux/rcupdate.h> +#include <linux/delay.h> +#include <net/iw_handler.h> +#include <asm/current.h> +#include <linux/audit.h> +#include <linux/dmaengine.h> +#include <linux/err.h> +#include <linux/ctype.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <net/mpls.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <trace/events/napi.h> +#include <trace/events/net.h> +#include <trace/events/skb.h> +#include <trace/events/qdisc.h> +#include <linux/inetdevice.h> +#include <linux/cpu_rmap.h> +#include <linux/static_key.h> +#include <linux/hashtable.h> +#include <linux/vmalloc.h> +#include <linux/if_macvlan.h> +#include <linux/errqueue.h> +#include <linux/hrtimer.h> +#include <linux/netfilter_netdev.h> +#include <linux/crash_dump.h> +#include <linux/sctp.h> +#include <net/udp_tunnel.h> +#include <linux/net_namespace.h> +#include <linux/indirect_call_wrapper.h> +#include <net/devlink.h> +#include <linux/pm_runtime.h> +#include <linux/prandom.h> +#include <linux/once_lite.h> + +#include "dev.h" +#include "net-sysfs.h" + + +static DEFINE_SPINLOCK(ptype_lock); +struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +struct list_head ptype_all __read_mostly; /* Taps */ + +static int netif_rx_internal(struct sk_buff *skb); +static int call_netdevice_notifiers_info(unsigned long val, + struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); +static struct napi_struct *napi_by_id(unsigned int napi_id); + +/* + * The @dev_base_head list is protected by @dev_base_lock and the rtnl + * semaphore. + * + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base_head list, and hold dev_base_lock for writing when they do the + * actual updates. This allows pure readers to access the list even + * while a writer is preparing to update it. + * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ +DEFINE_RWLOCK(dev_base_lock); +EXPORT_SYMBOL(dev_base_lock); + +static DEFINE_MUTEX(ifalias_mutex); + +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id = NR_CPUS; +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); + +static DECLARE_RWSEM(devnet_rename_sem); + +static inline void dev_base_seq_inc(struct net *net) +{ + while (++net->dev_base_seq == 0) + ; +} + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); + + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; +} + +static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); +} + +static inline void rps_lock_irq_disable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); +} + +static inline void rps_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static inline void rps_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); +} + +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +bool netdev_name_in_use(struct net *net, const char *name) +{ + return netdev_name_node_lookup(net, name); +} +EXPORT_SYMBOL(netdev_name_in_use); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. + */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + + netdev_name_node_del(name_node); + synchronize_rcu(); + __netdev_name_node_alt_destroy(name_node); + + return 0; +} + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + +/* Device list insertion */ +static void list_netdevice(struct net_device *dev) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + ASSERT_RTNL(); + + write_lock(&dev_base_lock); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + netdev_name_node_add(net, dev->name_node); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + write_unlock(&dev_base_lock); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_add(net, name_node); + + dev_base_seq_inc(net); +} + +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ +static void unlist_netdevice(struct net_device *dev, bool lock) +{ + struct netdev_name_node *name_node; + + ASSERT_RTNL(); + + netdev_for_each_altname(dev, name_node) + netdev_name_node_del(name_node); + + /* Unlink dev from the device chain */ + if (lock) + write_lock(&dev_base_lock); + list_del_rcu(&dev->dev_list); + netdev_name_node_del(dev->name_node); + hlist_del_rcu(&dev->index_hlist); + if (lock) + write_unlock(&dev_base_lock); + + dev_base_seq_inc(dev_net(dev)); +} + +/* + * Our notifier list + */ + +static RAW_NOTIFIER_HEAD(netdev_chain); + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the local softnet handler. + */ + +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +EXPORT_PER_CPU_SYMBOL(softnet_data); + +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} +#endif + +/******************************************************************************* + * + * Protocol management and registration routines + * + *******************************************************************************/ + + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscuous ptype_all chain in net_bh. + * It is true now, do not change it. + * Explanation follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return pt->dev ? &pt->dev->ptype_all : &ptype_all; + else + return pt->dev ? &pt->dev->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + +/** + * dev_add_pack - add packet handler + * @pt: packet type declaration + * + * Add a protocol handler to the networking stack. The passed &packet_type + * is linked into kernel lists and may not be freed until it has been + * removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new packet type (until the next received packet). + */ + +void dev_add_pack(struct packet_type *pt) +{ + struct list_head *head = ptype_head(pt); + + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); +} +EXPORT_SYMBOL(dev_add_pack); + +/** + * __dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_pack(struct packet_type *pt) +{ + struct list_head *head = ptype_head(pt); + struct packet_type *pt1; + + spin_lock(&ptype_lock); + + list_for_each_entry(pt1, head, list) { + if (pt == pt1) { + list_del_rcu(&pt->list); + goto out; + } + } + + pr_warn("dev_remove_pack: %p not found\n", pt); +out: + spin_unlock(&ptype_lock); +} +EXPORT_SYMBOL(__dev_remove_pack); + +/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_pack(struct packet_type *pt) +{ + __dev_remove_pack(pt); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_pack); + + +/******************************************************************************* + * + * Device Interface Subroutines + * + *******************************************************************************/ + +/** + * dev_get_iflink - get 'iflink' value of a interface + * @dev: targeted interface + * + * Indicates the ifindex the interface is linked to. + * Physical interfaces have the same 'ifindex' and 'iflink' values. + */ + +int dev_get_iflink(const struct net_device *dev) +{ + if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) + return dev->netdev_ops->ndo_get_iflink(dev); + + return dev->ifindex; +} +EXPORT_SYMBOL(dev_get_iflink); + +/** + * dev_fill_metadata_dst - Retrieve tunnel egress information. + * @dev: targeted interface + * @skb: The packet. + * + * For better visibility of tunnel traffic OVS needs to retrieve + * egress tunnel information for a packet. Following API allows + * user to get this info. + */ +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info; + + if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) + return -EINVAL; + + info = skb_tunnel_info_unclone(skb); + if (!info) + return -ENOMEM; + if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) + return -EINVAL; + + return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); +} +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); + +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) +{ + int k = stack->num_paths++; + + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; +} + +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + }; + struct net_device_path *path; + int ret = 0; + + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; + + memset(path, 0, sizeof(struct net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; + + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } + + if (!ctx.dev) + return ret; + + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; + + return ret; +} +EXPORT_SYMBOL_GPL(dev_fill_forward_path); + +/** + * __dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * or @dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +struct net_device *__dev_get_by_name(struct net *net, const char *name) +{ + struct netdev_name_node *node_name; + + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; +} +EXPORT_SYMBOL(__dev_get_by_name); + +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct netdev_name_node *node_name; + + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); + +/** + * dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. + */ + +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + +/** + * __dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the RTNL semaphore + * or @dev_base_lock. + */ + +struct net_device *__dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry(dev, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_get_by_index); + +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); + + +/** + * dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); + +/** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (napi_id < MIN_NAPI_ID) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? napi->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_napi_id); + +/** + * netdev_get_name - get a netdevice name, knowing its ifindex. + * @net: network namespace + * @name: a pointer to the buffer where the name will be stored. + * @ifindex: the ifindex of the interface to get the name from. + */ +int netdev_get_name(struct net *net, char *name, int ifindex) +{ + struct net_device *dev; + int ret; + + down_read(&devnet_rename_sem); + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + + strcpy(name, dev->name); + + ret = 0; +out: + rcu_read_unlock(); + up_read(&devnet_rename_sem); + return ret; +} + +/** + * dev_getbyhwaddr_rcu - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. + * The returned device has not had its ref count increased + * and the caller must therefore be careful about locking + * + */ + +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + for_each_netdev_rcu(net, dev) + if (dev->type == type && + !memcmp(dev->dev_addr, ha, dev->addr_len)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); + +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev, *ret = NULL; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + if (dev->type == type) { + dev_hold(dev); + ret = dev; + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + * __dev_get_by_flags - find any device with given flags + * @net: the applicable net namespace + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. Returns NULL if a device + * is not found or a pointer to the device. Must be called inside + * rtnl_lock(), and result refcount is unchanged. + */ + +struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, + unsigned short mask) +{ + struct net_device *dev, *ret; + + ASSERT_RTNL(); + + ret = NULL; + for_each_netdev(net, dev) { + if (((dev->flags ^ if_flags) & mask) == 0) { + ret = dev; + break; + } + } + return ret; +} +EXPORT_SYMBOL(__dev_get_by_flags); + +/** + * dev_valid_name - check if name is okay for network device + * @name: name string + * + * Network device names need to be valid file names to + * allow sysfs to work. We also disallow any kind of + * whitespace. + */ +bool dev_valid_name(const char *name) +{ + if (*name == '\0') + return false; + if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) + return false; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return false; + + while (*name) { + if (*name == '/' || *name == ':' || isspace(*name)) + return false; + name++; + } + return true; +} +EXPORT_SYMBOL(dev_valid_name); + +/** + * __dev_alloc_name - allocate a name for a device + * @net: network namespace to allocate the device name in + * @name: name format string + * @buf: scratch buffer and result name string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +static int __dev_alloc_name(struct net *net, const char *name, char *buf) +{ + int i = 0; + const char *p; + const int max_netdevices = 8*PAGE_SIZE; + unsigned long *inuse; + struct net_device *d; + + if (!dev_valid_name(name)) + return -EINVAL; + + p = strchr(name, '%'); + if (p) { + /* + * Verify the string as this thing may have come from + * the user. There must be either one "%d" and no other "%" + * characters. + */ + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + /* Use one page as a bit array of possible slots */ + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); + if (!inuse) + return -ENOMEM; + + for_each_netdev(net, d) { + struct netdev_name_node *name_node; + + netdev_for_each_altname(d, name_node) { + if (!sscanf(name_node->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, name_node->name, IFNAMSIZ)) + __set_bit(i, inuse); + } + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + __set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, max_netdevices); + free_page((unsigned long) inuse); + } + + snprintf(buf, IFNAMSIZ, name, i); + if (!netdev_name_in_use(net, buf)) + return i; + + /* It is possible to run out of possible slots + * when the name is long and there isn't enough space left + * for the digits, or if all bits are used. + */ + return -ENFILE; +} + +static int dev_prep_valid_name(struct net *net, struct net_device *dev, + const char *want_name, char *out_name) +{ + int ret; + + if (!dev_valid_name(want_name)) + return -EINVAL; + + if (strchr(want_name, '%')) { + ret = __dev_alloc_name(net, want_name, out_name); + return ret < 0 ? ret : 0; + } else if (netdev_name_in_use(net, want_name)) { + return -EEXIST; + } else if (out_name != want_name) { + strscpy(out_name, want_name, IFNAMSIZ); + } + + return 0; +} + +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) +{ + char buf[IFNAMSIZ]; + int ret; + + BUG_ON(!net); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strscpy(dev->name, buf, IFNAMSIZ); + return ret; +} + +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + return dev_alloc_name_ns(dev_net(dev), dev, name); +} +EXPORT_SYMBOL(dev_alloc_name); + +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) +{ + char buf[IFNAMSIZ]; + int ret; + + ret = dev_prep_valid_name(net, dev, name, buf); + if (ret >= 0) + strscpy(dev->name, buf, IFNAMSIZ); + return ret; +} + +/** + * dev_change_name - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + unsigned char old_assign_type; + char oldname[IFNAMSIZ]; + int err = 0; + int ret; + struct net *net; + + ASSERT_RTNL(); + BUG_ON(!dev_net(dev)); + + net = dev_net(dev); + + /* Some auto-enslaved devices e.g. failover slaves are + * special, as userspace might rename the device after + * the interface had been brought up and running since + * the point kernel initiated auto-enslavement. Allow + * live name change even when these slave devices are + * up and running. + * + * Typically, users of these auto-enslaving devices + * don't actually care about slave name change, as + * they are supposed to operate on master interface + * directly. + */ + if (dev->flags & IFF_UP && + likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) + return -EBUSY; + + down_write(&devnet_rename_sem); + + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { + up_write(&devnet_rename_sem); + return 0; + } + + memcpy(oldname, dev->name, IFNAMSIZ); + + err = dev_get_valid_name(net, dev, newname); + if (err < 0) { + up_write(&devnet_rename_sem); + return err; + } + + if (oldname[0] && !strchr(oldname, '%')) + netdev_info(dev, "renamed from %s\n", oldname); + + old_assign_type = dev->name_assign_type; + dev->name_assign_type = NET_NAME_RENAMED; + +rollback: + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + up_write(&devnet_rename_sem); + return ret; + } + + up_write(&devnet_rename_sem); + + netdev_adjacent_rename_links(dev, oldname); + + write_lock(&dev_base_lock); + netdev_name_node_del(dev->name_node); + write_unlock(&dev_base_lock); + + synchronize_rcu(); + + write_lock(&dev_base_lock); + netdev_name_node_add(net, dev->name_node); + write_unlock(&dev_base_lock); + + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); + ret = notifier_to_errno(ret); + + if (ret) { + /* err >= 0 after dev_alloc_name() or stores the first errno */ + if (err >= 0) { + err = ret; + down_write(&devnet_rename_sem); + memcpy(dev->name, oldname, IFNAMSIZ); + memcpy(oldname, newname, IFNAMSIZ); + dev->name_assign_type = old_assign_type; + old_assign_type = NET_NAME_RENAMED; + goto rollback; + } else { + netdev_err(dev, "name change rollback failed: %d\n", + ret); + } + } + + return err; +} + +/** + * dev_set_alias - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device, + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + struct dev_ifalias *new_alias = NULL; + + if (len >= IFALIASZ) + return -EINVAL; + + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; + } + + mutex_lock(&ifalias_mutex); + new_alias = rcu_replace_pointer(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); + + return len; +} +EXPORT_SYMBOL(dev_set_alias); + +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @name: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} + +/** + * netdev_features_change - device changes features + * @dev: device to cause notification + * + * Called to indicate a device has changed features. + */ +void netdev_features_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, + &change_info.info); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + } +} +EXPORT_SYMBOL(netdev_state_change); + +/** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void netdev_notify_peers(struct net_device *dev) +{ + rtnl_lock(); + __netdev_notify_peers(dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(netdev_notify_peers); + +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. + */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret; + + ASSERT_RTNL(); + dev_addr_check(dev); + + if (!netif_device_present(dev)) { + /* may be detached because parent is runtime-suspended */ + if (dev->dev.parent) + pm_runtime_resume(dev->dev.parent); + if (!netif_device_present(dev)) + return -ENODEV; + } + + /* Block netpoll from trying to do any rx path servicing. + * If we don't do this there is a chance ndo_poll_controller + * or ndo_poll may be running while we open the device + */ + netpoll_poll_disable(dev); + + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); + ret = notifier_to_errno(ret); + if (ret) + return ret; + + set_bit(__LINK_STATE_START, &dev->state); + + if (ops->ndo_validate_addr) + ret = ops->ndo_validate_addr(dev); + + if (!ret && ops->ndo_open) + ret = ops->ndo_open(dev); + + netpoll_poll_enable(dev); + + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + else { + dev->flags |= IFF_UP; + dev_set_rx_mode(dev); + dev_activate(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); + } + + return ret; +} + +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * @extack: netlink extended ack + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + */ +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +{ + int ret; + + if (dev->flags & IFF_UP) + return 0; + + ret = __dev_open(dev, extack); + if (ret < 0) + return ret; + + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + call_netdevice_notifiers(NETDEV_UP, dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +static void __dev_close_many(struct list_head *head) +{ + struct net_device *dev; + + ASSERT_RTNL(); + might_sleep(); + + list_for_each_entry(dev, head, close_list) { + /* Temporarily disable netpoll until the interface is down */ + netpoll_poll_disable(dev); + + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_atomic(); /* Commit netif_running(). */ + } + + dev_deactivate_many(head); + + list_for_each_entry(dev, head, close_list) { + const struct net_device_ops *ops = dev->netdev_ops; + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); + + dev->flags &= ~IFF_UP; + netpoll_poll_enable(dev); + } +} + +static void __dev_close(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->close_list, &single); + __dev_close_many(&single); + list_del(&single); +} + +void dev_close_many(struct list_head *head, bool unlink) +{ + struct net_device *dev, *tmp; + + /* Remove the devices that don't need to be closed */ + list_for_each_entry_safe(dev, tmp, head, close_list) + if (!(dev->flags & IFF_UP)) + list_del_init(&dev->close_list); + + __dev_close_many(head); + + list_for_each_entry_safe(dev, tmp, head, close_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + call_netdevice_notifiers(NETDEV_DOWN, dev); + if (unlink) + list_del_init(&dev->close_list); + } +} +EXPORT_SYMBOL(dev_close_many); + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +void dev_close(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + LIST_HEAD(single); + + list_add(&dev->close_list, &single); + dev_close_many(&single, true); + list_del(&single); + } +} +EXPORT_SYMBOL(dev_close); + + +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); +} +EXPORT_SYMBOL(dev_disable_lro); + +/** + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device + * @dev: device + * + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. + */ +static void dev_disable_gro_hw(struct net_device *dev) +{ + dev->wanted_features &= ~NETIF_F_GRO_HW; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + +const char *netdev_cmd_to_name(enum netdev_cmd cmd) +{ +#define N(val) \ + case NETDEV_##val: \ + return "NETDEV_" __stringify(val); + switch (cmd) { + N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) + N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) + N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) + N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) + N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) + N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) + N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) + N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) + } +#undef N + return "UNKNOWN_NETDEV_EVENT"; +} +EXPORT_SYMBOL_GPL(netdev_cmd_to_name); + +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info = { + .dev = dev, + }; + + return nb->notifier_call(nb, val, &info); +} + +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + +static int dev_boot_phase = 1; + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + struct net *net; + int err; + + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + for_each_net(net) { + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; + } + +unlock: + rtnl_unlock(); + up_write(&pernet_ops_rwsem); + return err; + +rollback: + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); + + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier); + +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + struct net *net; + int err; + + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock(); + err = raw_notifier_chain_unregister(&netdev_chain, nb); + if (err) + goto unlock; + + for_each_net(net) + call_netdevice_unregister_net_notifiers(nb, net); + +unlock: + rtnl_unlock(); + up_write(&pernet_ops_rwsem); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier); + +static int __register_netdevice_notifier_net(struct net *net, + struct notifier_block *nb, + bool ignore_call_fail) +{ + int err; + + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + return err; + if (dev_boot_phase) + return 0; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err && !ignore_call_fail) + goto chain_unregister; + + return 0; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + return err; +} + +static int __unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + return err; + + call_netdevice_unregister_net_notifiers(nb, net); + return 0; +} + +/** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = __register_netdevice_notifier_net(net, nb, false); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = __unregister_netdevice_notifier_net(net, nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + +int register_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); + if (!err) { + nn->nb = nb; + list_add(&nn->list, &dev->net_notifier_list); + } + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdevice_notifier_dev_net); + +int unregister_netdevice_notifier_dev_net(struct net_device *dev, + struct notifier_block *nb, + struct netdev_net_notifier *nn) +{ + int err; + + rtnl_lock(); + list_del(&nn->list); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); + +static void move_netdevice_notifiers_dev_net(struct net_device *dev, + struct net *net) +{ + struct netdev_net_notifier *nn; + + list_for_each_entry(nn, &dev->net_notifier_list, list) { + __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); + __register_netdevice_notifier_net(net, nn->nb, true); + } +} + +/** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +static int call_netdevice_notifiers_info(unsigned long val, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + int ret; + + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; + return raw_notifier_call_chain(&netdev_chain, val, info); +} + +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + +/** + * call_netdevice_notifiers - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers(unsigned long val, struct net_device *dev) +{ + return call_netdevice_notifiers_extack(val, dev, NULL); +} +EXPORT_SYMBOL(call_netdevice_notifiers); + +/** + * call_netdevice_notifiers_mtu - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @arg: additional u32 argument passed to the notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ +static int call_netdevice_notifiers_mtu(unsigned long val, + struct net_device *dev, u32 arg) +{ + struct netdev_notifier_info_ext info = { + .info.dev = dev, + .ext.mtu = arg, + }; + + BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); + + return call_netdevice_notifiers_info(val, &info.info); +} + +#ifdef CONFIG_NET_INGRESS +static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); + +void net_inc_ingress_queue(void) +{ + static_branch_inc(&ingress_needed_key); +} +EXPORT_SYMBOL_GPL(net_inc_ingress_queue); + +void net_dec_ingress_queue(void) +{ + static_branch_dec(&ingress_needed_key); +} +EXPORT_SYMBOL_GPL(net_dec_ingress_queue); +#endif + +#ifdef CONFIG_NET_EGRESS +static DEFINE_STATIC_KEY_FALSE(egress_needed_key); + +void net_inc_egress_queue(void) +{ + static_branch_inc(&egress_needed_key); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_branch_dec(&egress_needed_key); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); +#ifdef CONFIG_JUMP_LABEL +static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; +static void netstamp_clear(struct work_struct *work) +{ + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; + + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_branch_enable(&netstamp_needed_key); + else + static_branch_disable(&netstamp_needed_key); +} +static DECLARE_WORK(netstamp_work, netstamp_clear); +#endif + +void net_enable_timestamp(void) +{ +#ifdef CONFIG_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) + return; + } + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else + static_branch_inc(&netstamp_needed_key); +#endif +} +EXPORT_SYMBOL(net_enable_timestamp); + +void net_disable_timestamp(void) +{ +#ifdef CONFIG_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 1) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) + return; + } + atomic_dec(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else + static_branch_dec(&netstamp_needed_key); +#endif +} +EXPORT_SYMBOL(net_disable_timestamp); + +static inline void net_timestamp_set(struct sk_buff *skb) +{ + skb->tstamp = 0; + skb->mono_delivery_time = 0; + if (static_branch_unlikely(&netstamp_needed_key)) + skb->tstamp = ktime_get_real(); +} + +#define net_timestamp_check(COND, SKB) \ + if (static_branch_unlikely(&netstamp_needed_key)) { \ + if ((COND) && !(SKB)->tstamp) \ + (SKB)->tstamp = ktime_get_real(); \ + } \ + +bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) +{ + return __is_skb_forwardable(dev, skb, true); +} +EXPORT_SYMBOL_GPL(is_skb_forwardable); + +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) +{ + int ret = ____dev_forward_skb(dev, skb, check_mtu); + + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } + + return ret; +} + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} +EXPORT_SYMBOL_GPL(__dev_forward_skb); + +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped, but freed) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) + return -ENOMEM; + refcount_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *orig_dev, + __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (pt_prev) + deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (!ptype->af_packet_priv || !skb->sk) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + +/** + * dev_nit_active - return true if any network interface taps are in use + * + * @dev: network device to check for the presence of taps + */ +bool dev_nit_active(struct net_device *dev) +{ + return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); +} +EXPORT_SYMBOL_GPL(dev_nit_active); + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. + */ + +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ + struct packet_type *ptype; + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; + struct list_head *ptype_list = &ptype_all; + + rcu_read_lock(); +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->ignore_outgoing) + continue; + + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if (skb_loop_sk(ptype, skb)) + continue; + + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; + + net_timestamp_set(skb2); + + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); + } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + + if (ptype_list == &ptype_all) { + ptype_list = &dev->ptype_all; + goto again; + } +out_unlock: + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); + +/** + * netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) +{ + if (dev->num_tc) { + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + int i; + + /* walk through the TCs and see if it falls into any of them */ + for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { + if ((txq - tc->offset) < tc->count) + return i; + } + + /* didn't find it, just return -1 to indicate no match */ + return -1; + } + + return 0; +} +EXPORT_SYMBOL(netdev_txq_to_tc); + +#ifdef CONFIG_XPS +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static bool remove_xps_queue(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *old_maps, int tci, u16 index) +{ + struct xps_map *map = NULL; + int pos; + + if (dev_maps) + map = xmap_dereference(dev_maps->attr_map[tci]); + if (!map) + return false; + + for (pos = map->len; pos--;) { + if (map->queues[pos] != index) + continue; + + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; + break; + } + + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); + return false; + } + + return true; +} + +static bool remove_xps_queue_cpu(struct net_device *dev, + struct xps_dev_maps *dev_maps, + int cpu, u16 offset, u16 count) +{ + int num_tc = dev_maps->num_tc; + bool active = false; + int tci; + + for (tci = cpu * num_tc; num_tc--; tci++) { + int i, j; + + for (i = count, j = offset; i--; j++) { + if (!remove_xps_queue(dev_maps, NULL, tci, j)) + break; + } + + active |= i < 0; + } + + return active; +} + +static void reset_xps_maps(struct net_device *dev, + struct xps_dev_maps *dev_maps, + enum xps_map_type type) +{ + static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + + kfree_rcu(dev_maps, rcu); +} + +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) +{ + struct xps_dev_maps *dev_maps; + bool active = false; + int i, j; + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); + if (!active) + reset_xps_maps(dev, dev_maps, type); + + if (type == XPS_CPUS) { + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), NUMA_NO_NODE); + } +} + +static void netif_reset_xps_queues(struct net_device *dev, u16 offset, + u16 count) +{ + if (!static_key_false(&xps_needed)) + return; + + cpus_read_lock(); + mutex_lock(&xps_map_mutex); + + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); + + clean_xps_maps(dev, XPS_CPUS, offset, count); + + mutex_unlock(&xps_map_mutex); + cpus_read_unlock(); +} + +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); +} + +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) +{ + struct xps_map *new_map; + int alloc_len = XPS_MIN_MAP_ALLOC; + int i, pos; + + for (pos = 0; map && pos < map->len; pos++) { + if (map->queues[pos] != index) + continue; + return map; + } + + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ + if (map) { + if (pos < map->alloc_len) + return map; + + alloc_len = map->alloc_len * 2; + } + + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); + if (!new_map) + return NULL; + + for (i = 0; i < pos; i++) + new_map->queues[i] = map->queues[i]; + new_map->alloc_len = alloc_len; + new_map->len = pos; + + return new_map; +} + +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) +{ + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + +/* Must be called under cpus_read_lock */ +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, enum xps_map_type type) +{ + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; + const unsigned long *online_mask = NULL; + bool active = false, copy = false; + int i, j, tci, numa_node_id = -2; + int maps_sz, num_tc = 1, tc = 0; + struct xps_map *map, *new_map; + unsigned int nr_ids; + + WARN_ON_ONCE(index >= dev->num_tx_queues); + + if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ + num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) + online_mask = cpumask_bits(cpu_online_mask); + nr_ids = nr_cpu_ids; + } + + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; + + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. + */ + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) + copy = true; + + /* allocate memory for queue storage */ + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { + if (!new_dev_maps) { + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->nr_ids = nr_ids; + new_dev_maps->num_tc = num_tc; + } + + tci = j * num_tc + tc; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; + + map = expand_xps_map(map, j, index, type == XPS_RXQS); + if (!map) + goto error; + + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } + + if (!new_dev_maps) + goto out_no_new_maps; + + if (!dev_maps) { + /* Increment static keys at most once per type */ + static_key_slow_inc_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_inc_cpuslocked(&xps_rxqs_needed); + } + + for (j = 0; j < nr_ids; j++) { + bool skip_tc = false; + + tci = j * num_tc + tc; + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ + int pos = 0; + + skip_tc = true; + + map = xmap_dereference(new_dev_maps->attr_map[tci]); + while ((pos < map->len) && (map->queues[pos] != index)) + pos++; + + if (pos == map->len) + map->queues[map->len++] = index; +#ifdef CONFIG_NUMA + if (type == XPS_CPUS) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } +#endif + } + + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); + } + + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); + + /* Cleanup old maps */ + if (!dev_maps) + goto out_no_old_maps; + + for (j = 0; j < dev_maps->nr_ids; j++) { + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { + map = xmap_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); + } + } + + old_dev_maps = dev_maps; + +out_no_old_maps: + dev_maps = new_dev_maps; + active = true; + +out_no_new_maps: + if (type == XPS_CPUS) + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); + + if (!dev_maps) + goto out_no_maps; + + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = 0; j < dev_maps->nr_ids; j++) { + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; + + active |= remove_xps_queue(dev_maps, + copy ? old_dev_maps : NULL, + tci, index); + } + } + + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + + /* free map if not active */ + if (!active) + reset_xps_maps(dev, dev_maps, type); + +out_no_maps: + mutex_unlock(&xps_map_mutex); + + return 0; +error: + /* remove any maps that we added */ + for (j = 0; j < nr_ids; j++) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = copy ? + xmap_dereference(dev_maps->attr_map[tci]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } + } + + mutex_unlock(&xps_map_mutex); + + kfree(new_dev_maps); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(__netif_set_xps_queue); + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + int ret; + + cpus_read_lock(); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); + cpus_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_set_xps_queue); + +#endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + +void netdev_reset_tc(struct net_device *dev) +{ +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} +EXPORT_SYMBOL(netdev_reset_tc); + +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues(dev, offset, count); +#endif + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} +EXPORT_SYMBOL(netdev_set_tc_queue); + +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + netdev_unbind_all_sb_channels(dev); + + dev->num_tc = num_tc; + return 0; +} +EXPORT_SYMBOL(netdev_set_num_tc); + +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. + */ +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ + bool disabling; + int rc; + + disabling = txq < dev->real_num_tx_queues; + + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (dev->num_tc) + netif_setup_tc(dev, txq); + + dev_qdisc_change_real_num_tx(dev, txq); + + dev->real_num_tx_queues = txq; + + if (disabling) { + synchronize_net(); + qdisc_reset_all_tx_gt(dev, txq); +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, txq); +#endif + } + } else { + dev->real_num_tx_queues = txq; + } + + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); + +#ifdef CONFIG_SYSFS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it always + * succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif + +/** + * netif_set_real_num_queues - set actual number of RX and TX queues used + * @dev: Network device + * @txq: Actual number of TX queues + * @rxq: Actual number of RX queues + * + * Set the real number of both TX and RX queues. + * Does nothing if the number of queues is already correct. + */ +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq) +{ + unsigned int old_rxq = dev->real_num_rx_queues; + int err; + + if (txq < 1 || txq > dev->num_tx_queues || + rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + /* Start from increases, so the error path only does decreases - + * decreases can't fail. + */ + if (rxq > dev->real_num_rx_queues) { + err = netif_set_real_num_rx_queues(dev, rxq); + if (err) + return err; + } + if (txq > dev->real_num_tx_queues) { + err = netif_set_real_num_tx_queues(dev, txq); + if (err) + goto undo_rx; + } + if (rxq < dev->real_num_rx_queues) + WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); + if (txq < dev->real_num_tx_queues) + WARN_ON(netif_set_real_num_tx_queues(dev, txq)); + + return 0; +undo_rx: + WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); + return err; +} +EXPORT_SYMBOL(netif_set_real_num_queues); + +/** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + +/** + * netif_get_num_default_rss_queues - default number of RSS queues + * + * Default value is the number of physical cores if there are only 1 or 2, or + * divided by 2 if there are more. + */ +int netif_get_num_default_rss_queues(void) +{ + cpumask_var_t cpus; + int cpu, count = 0; + + if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) + return 1; + + cpumask_copy(cpus, cpu_online_mask); + for_each_cpu(cpu, cpus) { + ++count; + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + } + free_cpumask_var(cpus); + + return count > 2 ? DIV_ROUND_UP(count, 2) : count; +} +EXPORT_SYMBOL(netif_get_num_default_rss_queues); + +static void __netif_reschedule(struct Qdisc *q) +{ + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = this_cpu_ptr(&softnet_data); + q->next_sched = NULL; + *sd->output_queue_tailp = q; + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} + +void __netif_schedule(struct Qdisc *q) +{ + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) + __netif_reschedule(q); +} +EXPORT_SYMBOL(__netif_schedule); + +struct dev_kfree_skb_cb { + enum skb_free_reason reason; +}; + +static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) +{ + return (struct dev_kfree_skb_cb *)skb->cb; +} + +void netif_schedule_queue(struct netdev_queue *txq) +{ + rcu_read_lock(); + if (!netif_xmit_stopped(txq)) { + struct Qdisc *q = rcu_dereference(txq->qdisc); + + __netif_schedule(q); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(netif_schedule_queue); + +void netif_tx_wake_queue(struct netdev_queue *dev_queue) +{ + if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { + struct Qdisc *q; + + rcu_read_lock(); + q = rcu_dereference(dev_queue->qdisc); + __netif_schedule(q); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(netif_tx_wake_queue); + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) +{ + unsigned long flags; + + if (unlikely(!skb)) + return; + + if (likely(refcount_read(&skb->users) == 1)) { + smp_rmb(); + refcount_set(&skb->users, 0); + } else if (likely(!refcount_dec_and_test(&skb->users))) { + return; + } + get_kfree_skb_cb(skb)->reason = reason; + local_irq_save(flags); + skb->next = __this_cpu_read(softnet_data.completion_queue); + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__dev_kfree_skb_irq); + +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) +{ + if (in_hardirq() || irqs_disabled()) + __dev_kfree_skb_irq(skb, reason); + else if (unlikely(reason == SKB_REASON_DROPPED)) + kfree_skb(skb); + else + consume_skb(skb); +} +EXPORT_SYMBOL(__dev_kfree_skb_any); + + +/** + * netif_device_detach - mark device as removed + * @dev: network device + * + * Mark device as removed from system and therefore no longer available. + */ +void netif_device_detach(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_tx_stop_all_queues(dev); + } +} +EXPORT_SYMBOL(netif_device_detach); + +/** + * netif_device_attach - mark device as attached + * @dev: network device + * + * Mark device as attached from system and restart if needed. + */ +void netif_device_attach(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_tx_wake_all_queues(dev); + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_device_attach); + +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = dev->real_num_tx_queues; + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + if (unlikely(!qcount)) { + net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", + sb_dev->name, qoffset, tc); + qoffset = 0; + qcount = dev->real_num_tx_queues; + } + } + + if (skb_rx_queue_recorded(skb)) { + DEBUG_NET_WARN_ON_ONCE(qcount == 0); + hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; + while (unlikely(hash >= qcount)) + hash -= qcount; + return hash + qoffset; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} + +static void skb_warn_bad_offload(const struct sk_buff *skb) +{ + static const netdev_features_t null_features; + struct net_device *dev = skb->dev; + const char *name = ""; + + if (!net_ratelimit()) + return; + + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } + skb_dump(KERN_WARNING, skb, false); + WARN(1, "%s: caps=(%pNF, %pNF)\n", + name, dev ? &dev->features : &null_features, + skb->sk ? &skb->sk->sk_route_caps : &null_features); +} + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. + */ +int skb_checksum_help(struct sk_buff *skb) +{ + __wsum csum; + int ret = 0, offset; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + goto out_set_summed; + + if (unlikely(skb_is_gso(skb))) { + skb_warn_bad_offload(skb); + return -EINVAL; + } + + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (skb_has_shared_frag(skb)) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + + offset = skb_checksum_start_offset(skb); + ret = -EINVAL; + if (unlikely(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", + offset, skb_headlen(skb)); + goto out; + } + csum = skb_checksum(skb, offset, skb->len - offset, 0); + + offset += skb->csum_offset; + if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", + offset + sizeof(__sum16), skb_headlen(skb)); + goto out; + } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; + + *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; +out_set_summed: + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} +EXPORT_SYMBOL(skb_checksum_help); + +int skb_crc32c_csum_help(struct sk_buff *skb) +{ + __le32 crc32c_csum; + int ret = 0, offset, start; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; + + if (unlikely(skb_is_gso(skb))) + goto out; + + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; + } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, + skb->len - start, ~(__u32)0, + crc32c_csum_stub)); + *(__le32 *)(skb->data + offset) = crc32c_csum; + skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; +out: + return ret; +} + +__be16 skb_network_protocol(struct sk_buff *skb, int *depth) +{ + __be16 type = skb->protocol; + + /* Tunnel gso handlers can set protocol to ethernet. */ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; + + eth = (struct ethhdr *)skb->data; + type = eth->h_proto; + } + + return vlan_get_protocol_and_depth(skb, type, depth); +} + +/* openvswitch calls this on rx path, so we need a different check. + */ +static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) +{ + if (tx_path) + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; + + return skb->ip_summed == CHECKSUM_NONE; +} + +/** + * __skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @tx_path: whether it is called in TX path + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. + */ +struct sk_buff *__skb_gso_segment(struct sk_buff *skb, + netdev_features_t features, bool tx_path) +{ + struct sk_buff *segs; + + if (unlikely(skb_needs_check(skb, tx_path))) { + int err; + + /* We're going to init ->check field in TCP or UDP header */ + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + } + + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); + SKB_GSO_CB(skb)->encap_level = 0; + + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + segs = skb_mac_gso_segment(skb, features); + + if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) + skb_warn_bad_offload(skb); + + return segs; +} +EXPORT_SYMBOL(__skb_gso_segment); + +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) +{ + netdev_err(dev, "hw csum failure\n"); + skb_dump(KERN_ERR, skb, true); + dump_stack(); +} + +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) +{ + DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + +/* XXX: check that highmem exists at all on the given machine. */ +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_HIGHMEM + int i; + + if (!(dev->features & NETIF_F_HIGHDMA)) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (PageHighMem(skb_frag_page(frag))) + return 1; + } + } +#endif + return 0; +} + +/* If MPLS offload request, verify we are testing hardware MPLS features + * instead of standard features for the netdev. + */ +#if IS_ENABLED(CONFIG_NET_MPLS_GSO) +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + if (eth_p_mpls(type)) + features &= skb->dev->mpls_features; + + return features; +} +#else +static netdev_features_t net_mpls_features(struct sk_buff *skb, + netdev_features_t features, + __be16 type) +{ + return features; +} +#endif + +static netdev_features_t harmonize_features(struct sk_buff *skb, + netdev_features_t features) +{ + __be16 type; + + type = skb_network_protocol(skb, NULL); + features = net_mpls_features(skb, features, type); + + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, type)) { + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + } + if (illegal_highdma(skb->dev, skb)) + features &= ~NETIF_F_SG; + + return features; +} + +netdev_features_t passthru_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return features; +} +EXPORT_SYMBOL(passthru_features_check); + +static netdev_features_t dflt_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + return vlan_features_check(skb, features); +} + +static netdev_features_t gso_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + u16 gso_segs = skb_shinfo(skb)->gso_segs; + + if (gso_segs > READ_ONCE(dev->gso_max_segs)) + return features & ~NETIF_F_GSO_MASK; + + if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size))) + return features & ~NETIF_F_GSO_MASK; + + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + + /* Support for GSO partial features requires software + * intervention before we can actually process the packets + * so we need to strip support for any partial features now + * and we can pull them back in after we have partially + * segmented the frame. + */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) + features &= ~dev->gso_partial_features; + + /* Make sure to clear the IPv4 ID mangling feature if the + * IPv4 header has the potential to be fragmented. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + struct iphdr *iph = skb->encapsulation ? + inner_ip_hdr(skb) : ip_hdr(skb); + + if (!(iph->frag_off & htons(IP_DF))) + features &= ~NETIF_F_TSO_MANGLEID; + } + + return features; +} + +netdev_features_t netif_skb_features(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + netdev_features_t features = dev->features; + + if (skb_is_gso(skb)) + features = gso_features_check(skb, dev, features); + + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + + if (skb_vlan_tagged(skb)) + features = netdev_intersect_features(features, + dev->vlan_features | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + if (dev->netdev_ops->ndo_features_check) + features &= dev->netdev_ops->ndo_features_check(skb, dev, + features); + else + features &= dflt_features_check(skb, dev, features); + + return harmonize_features(skb, features); +} +EXPORT_SYMBOL(netif_skb_features); + +static int xmit_one(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq, bool more) +{ + unsigned int len; + int rc; + + if (dev_nit_active(dev)) + dev_queue_xmit_nit(skb, dev); + + len = skb->len; + trace_net_dev_start_xmit(skb, dev); + rc = netdev_start_xmit(skb, dev, txq, more); + trace_net_dev_xmit(skb, rc, dev, len); + + return rc; +} + +struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, + struct netdev_queue *txq, int *ret) +{ + struct sk_buff *skb = first; + int rc = NETDEV_TX_OK; + + while (skb) { + struct sk_buff *next = skb->next; + + skb_mark_not_on_list(skb); + rc = xmit_one(skb, dev, txq, next != NULL); + if (unlikely(!dev_xmit_complete(rc))) { + skb->next = next; + goto out; + } + + skb = next; + if (netif_tx_queue_stopped(txq) && skb) { + rc = NETDEV_TX_BUSY; + break; + } + } + +out: + *ret = rc; + return skb; +} + +static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, + netdev_features_t features) +{ + if (skb_vlan_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); + return skb; +} + +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb_csum_is_sctp(skb))) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); + + if (features & NETIF_F_HW_CSUM) + return 0; + + if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return 0; + } + } + + return skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); + +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) +{ + netdev_features_t features; + + features = netif_skb_features(skb); + skb = validate_xmit_vlan(skb, features); + if (unlikely(!skb)) + goto out_null; + + skb = sk_validate_xmit_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + + if (netif_needs_gso(skb, features)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + if (IS_ERR(segs)) { + goto out_kfree_skb; + } else if (segs) { + consume_skb(skb); + skb = segs; + } + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (skb_csum_hwoffload_help(skb, features)) + goto out_kfree_skb; + } + } + + skb = validate_xmit_xfrm(skb, features, again); + + return skb; + +out_kfree_skb: + kfree_skb(skb); +out_null: + dev_core_stats_tx_dropped_inc(dev); + return NULL; +} + +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) +{ + struct sk_buff *next, *head = NULL, *tail; + + for (; skb != NULL; skb = next) { + next = skb->next; + skb_mark_not_on_list(skb); + + /* in case skb wont be segmented, point to itself */ + skb->prev = skb; + + skb = validate_xmit_skb(skb, dev, again); + if (!skb) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + /* If skb was segmented, skb->prev points to + * the last segment. If not, it still contains skb. + */ + tail = skb->prev; + } + return head; +} +EXPORT_SYMBOL_GPL(validate_xmit_skb_list); + +static void qdisc_pkt_len_init(struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + + qdisc_skb_cb(skb)->pkt_len = skb->len; + + /* To get more precise estimation of bytes sent on wire, + * we add to pkt_len the headers size of all segments + */ + if (shinfo->gso_size && skb_transport_header_was_set(skb)) { + unsigned int hdr_len; + u16 gso_segs = shinfo->gso_segs; + + /* mac layer + network layer */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } + + if (shinfo->gso_type & SKB_GSO_DODGY) + gso_segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); + + qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; + } +} + +static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, + struct sk_buff **to_free, + struct netdev_queue *txq) +{ + int rc; + + rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; + if (rc == NET_XMIT_SUCCESS) + trace_qdisc_enqueue(q, txq, skb); + return rc; +} + +static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, + struct netdev_queue *txq) +{ + spinlock_t *root_lock = qdisc_lock(q); + struct sk_buff *to_free = NULL; + bool contended; + int rc; + + qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && + qdisc_run_begin(q)) { + /* Retest nolock_qdisc_is_empty() within the protection + * of q->seqlock to protect from racing with requeuing. + */ + if (unlikely(!nolock_qdisc_is_empty(q))) { + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + __qdisc_run(q); + qdisc_run_end(q); + + goto no_lock_out; + } + + qdisc_bstats_cpu_update(q, skb); + if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && + !nolock_qdisc_is_empty(q)) + __qdisc_run(q); + + qdisc_run_end(q); + return NET_XMIT_SUCCESS; + } + + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + qdisc_run(q); + +no_lock_out: + if (unlikely(to_free)) + kfree_skb_list_reason(to_free, + SKB_DROP_REASON_QDISC_DROP); + return rc; + } + + /* + * Heuristic to force contended enqueues to serialize on a + * separate lock before trying to get qdisc main lock. + * This permits qdisc->running owner to get the lock more + * often and dequeue packets faster. + * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit + * and then other tasks will only enqueue packets. The packets will be + * sent after the qdisc owner is scheduled again. To prevent this + * scenario the task always serialize on the lock. + */ + contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); + if (unlikely(contended)) + spin_lock(&q->busylock); + + spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + qdisc_run_begin(q)) { + /* + * This is a work-conserving queue; there are no old skbs + * waiting to be sent out; and the qdisc is not running - + * xmit the skb directly. + */ + + qdisc_bstats_update(q, skb); + + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } + + qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + } else { + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + qdisc_run_end(q); + } + } + spin_unlock(root_lock); + if (unlikely(to_free)) + kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); + if (unlikely(contended)) + spin_unlock(&q->busylock); + return rc; +} + +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) +static void skb_update_prio(struct sk_buff *skb) +{ + const struct netprio_map *map; + const struct sock *sk; + unsigned int prioidx; + + if (skb->priority) + return; + map = rcu_dereference_bh(skb->dev->priomap); + if (!map) + return; + sk = skb_to_full_sk(skb); + if (!sk) + return; + + prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); + + if (prioidx < map->priomap_len) + skb->priority = map->priomap[prioidx]; +} +#else +#define skb_update_prio(skb) +#endif + +/** + * dev_loopback_xmit - loop back @skb + * @net: network namespace this loopback is happening in + * @sk: sk needed to be a netfilter okfn + * @skb: buffer to transmit + */ +int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->pkt_type = PACKET_LOOPBACK; + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); + skb_dst_force(skb); + netif_rx(skb); + return 0; +} +EXPORT_SYMBOL(dev_loopback_xmit); + +#ifdef CONFIG_NET_EGRESS +static struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ +#ifdef CONFIG_NET_CLS_ACT + struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); + struct tcf_result cl_res; + + if (!miniq) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; + mini_qdisc_bstats_cpu_update(miniq, skb); + + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + *ret = NET_XMIT_DROP; + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + return NULL; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *ret = NET_XMIT_SUCCESS; + consume_skb(skb); + return NULL; + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + return NULL; + default: + break; + } +#endif /* CONFIG_NET_CLS_ACT */ + + return skb; +} + +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + int tc = netdev_get_prio_tc_map(dev, skb->priority); + struct xps_map *map; + int queue_index = -1; + + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct sock *sk = skb->sk; + int queue_index = -1; + + if (!static_key_false(&xps_needed)) + return -1; + + rcu_read_lock(); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); + if (dev_maps) { + int tci = sk_rx_queue_get(sk); + + if (tci >= 0) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } + +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; +} +EXPORT_SYMBOL(dev_pick_tx_cpu_id); + +u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + sb_dev = sb_dev ? : dev; + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, sb_dev, skb); + + if (new_index < 0) + new_index = skb_tx_hash(dev, sb_dev, skb); + + if (queue_index != new_index && sk && + sk_fullsock(sk) && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} +EXPORT_SYMBOL(netdev_pick_tx); + +struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, + struct sk_buff *skb, + struct net_device *sb_dev) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, sb_dev); + else + queue_index = netdev_pick_tx(dev, skb, sb_dev); + + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + +/** + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. + * + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. + * + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) + * + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors + */ +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq = NULL; + struct Qdisc *q; + int rc = -ENOMEM; + bool again = false; + + skb_reset_mac_header(skb); + skb_assert_len(skb); + + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); + + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. + */ + rcu_read_lock_bh(); + + skb_update_prio(skb); + + qdisc_pkt_len_init(skb); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_at_ingress = 0; +#endif +#ifdef CONFIG_NET_EGRESS + if (static_branch_unlikely(&egress_needed_key)) { + if (nf_hook_egress_active()) { + skb = nf_hook_egress(skb, &rc, dev); + if (!skb) + goto out; + } + + netdev_xmit_skip_txqueue(false); + + nf_skip_egress(skb, true); + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); + } +#endif + /* If device/qdisc don't need skb->dst, release it right now while + * its hot in this cpu cache. + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + else + skb_dst_force(skb); + + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + + q = rcu_dereference_bh(txq->qdisc); + + trace_net_dev_queue(skb); + if (q->enqueue) { + rc = __dev_xmit_skb(skb, q, dev, txq); + goto out; + } + + /* The device has no queue. Common case for software devices: + * loopback, all the sorts of tunnels... + + * Really, it is unlikely that netif_tx_lock protection is necessary + * here. (f.e. loopback and IP tunnels are clean ignoring statistics + * counters.) + * However, it is possible, that they rely on protection + * made by us here. + + * Check this and shot the lock. It is not prone from deadlocks. + *Either shot noqueue qdisc, it is even simpler 8) + */ + if (dev->flags & IFF_UP) { + int cpu = smp_processor_id(); /* ok because BHs are off */ + + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { + if (dev_xmit_recursion()) + goto recursion_alert; + + skb = validate_xmit_skb(skb, dev, &again); + if (!skb) + goto out; + + HARD_TX_LOCK(dev, txq, cpu); + + if (!netif_xmit_stopped(txq)) { + dev_xmit_recursion_inc(); + skb = dev_hard_start_xmit(skb, dev, txq, &rc); + dev_xmit_recursion_dec(); + if (dev_xmit_complete(rc)) { + HARD_TX_UNLOCK(dev, txq); + goto out; + } + } + HARD_TX_UNLOCK(dev, txq); + net_crit_ratelimited("Virtual device %s asks to queue packet!\n", + dev->name); + } else { + /* Recursion is detected! It is possible, + * unfortunately + */ +recursion_alert: + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + } + } + + rc = -ENETDOWN; + rcu_read_unlock_bh(); + + dev_core_stats_tx_dropped_inc(dev); + kfree_skb_list(skb); + return rc; +out: + rcu_read_unlock_bh(); + return rc; +} +EXPORT_SYMBOL(__dev_queue_xmit); + +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); + + local_bh_disable(); + + dev_xmit_recursion_inc(); + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + dev_xmit_recursion_dec(); + + local_bh_enable(); + return ret; +drop: + dev_core_stats_tx_dropped_inc(dev); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(__dev_direct_xmit); + +/************************************************************************* + * Receiver routines + *************************************************************************/ + +int netdev_max_backlog __read_mostly = 1000; +EXPORT_SYMBOL(netdev_max_backlog); + +int netdev_tstamp_prequeue __read_mostly = 1; +unsigned int sysctl_skb_defer_max __read_mostly = 64; +int netdev_budget __read_mostly = 300; +/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ +unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; +int weight_p __read_mostly = 64; /* old backlog weight */ +int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ +int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ +int dev_rx_weight __read_mostly = 64; +int dev_tx_weight __read_mostly = 64; + +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + struct task_struct *thread; + + lockdep_assert_irqs_disabled(); + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable()/dev_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call + * wake_up_process() when it's not NULL. + */ + thread = READ_ONCE(napi->thread); + if (thread) { + /* Avoid doing set_bit() if the thread is in + * INTERRUPTIBLE state, cause napi_thread_wait() + * makes sure to proceed with napi polling + * if the thread is explicitly woken from here. + */ + if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + wake_up_process(thread); + return; + } + } + + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); +u32 rps_cpu_mask __read_mostly; +EXPORT_SYMBOL(rps_cpu_mask); + +struct static_key_false rps_needed __read_mostly; +EXPORT_SYMBOL(rps_needed); +struct static_key_false rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); + +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + if (next_cpu < nr_cpu_ids) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? */ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb_get_hash(skb) & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, next_cpu).input_queue_head; + } + + rflow->cpu = next_cpu; + return rflow; +} + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; + struct rps_dev_flow_table *flow_table; + struct rps_map *map; + int cpu = -1; + u32 tcpu; + u32 hash; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + goto done; + } + rxqueue += index; + } + + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); + map = rcu_dereference(rxqueue->rps_map); + if (!flow_table && !map) + goto done; + + skb_reset_network_header(skb); + hash = skb_get_hash(skb); + if (!hash) + goto done; + + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; + + /* First check into global flow table if there is a match. + * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). + */ + ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); + if ((ident ^ hash) & ~rps_cpu_mask) + goto try_rps; + + next_cpu = ident & rps_cpu_mask; + + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ + rflow = &flow_table->flows[hash & flow_table->mask]; + tcpu = rflow->cpu; + + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (>= nr_cpu_ids). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. + */ + if (unlikely(tcpu != next_cpu) && + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) { + tcpu = next_cpu; + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + } + + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + +try_rps: + + if (map) { + tcpu = map->cpus[reciprocal_scale(hash, map->len)]; + if (cpu_online(tcpu)) { + cpu = tcpu; + goto done; + } + } + +done: + return cpu; +} + +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. + */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + unsigned int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = READ_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu < nr_cpu_ids && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) +{ + struct softnet_data *sd = data; + + ____napi_schedule(sd, &sd->backlog); + sd->received_rps++; +} + +#endif /* CONFIG_RPS */ + +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + +/* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int napi_schedule_rps(struct softnet_data *sd) +{ + struct softnet_data *mysd = this_cpu_ptr(&softnet_data); + +#ifdef CONFIG_RPS + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + __napi_schedule_irqoff(&mysd->backlog); + return 0; +} + +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + return false; + + sd = this_cpu_ptr(&softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) +{ + enum skb_drop_reason reason; + struct softnet_data *sd; + unsigned long flags; + unsigned int qlen; + + reason = SKB_DROP_REASON_NOT_SPECIFIED; + sd = &per_cpu(softnet_data, cpu); + + rps_lock_irqsave(sd, &flags); + if (!netif_running(skb->dev)) + goto drop; + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { + if (qlen) { +enqueue: + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + rps_unlock_irq_restore(sd, &flags); + return NET_RX_SUCCESS; + } + + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + napi_schedule_rps(sd); + goto enqueue; + } + reason = SKB_DROP_REASON_CPU_BACKLOG; + +drop: + sd->dropped++; + rps_unlock_irq_restore(sd, &flags); + + dev_core_stats_rx_dropped_inc(skb->dev); + kfree_skb_reason(skb, reason); + return NET_RX_DROP; +} + +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + void *orig_data, *orig_data_end, *hard_start; + struct netdev_rx_queue *rxqueue; + bool orig_bcast, orig_host; + u32 mac_len, frame_sz; + __be16 orig_eth_type; + struct ethhdr *eth; + u32 metalen, act; + int off; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + frame_sz = (void *)skb_end_pointer(skb) - hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); + + orig_data_end = xdp->data_end; + orig_data = xdp->data; + eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); + orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); + orig_eth_type = eth->h_proto; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + /* check if bpf_xdp_adjust_head was used */ + off = xdp->data - orig_data; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } + + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len += off; /* positive on grow, negative on shrink */ + } + + /* check if XDP changed eth hdr such SKB needs update */ + eth = (struct ethhdr *)xdp->data; + if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || + (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { + __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + } + + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + __skb_push(skb, mac_len); + break; + case XDP_PASS: + metalen = xdp->data - xdp->data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; + } + + return act; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (skb_linearize(skb)) + goto do_drop; + } + + act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. This also means + * that XDP packets are able to starve other packets going through a qdisc, + * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX + * queues, so they do not have this starvation issue. + */ +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_core_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_frozen_or_drv_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + dev_core_stats_tx_dropped_inc(dev); + kfree_skb(skb); + } +} + +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); + +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +{ + if (xdp_prog) { + struct xdp_buff xdp; + u32 act; + int err; + + act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + if (act != XDP_PASS) { + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect(skb->dev, skb, + &xdp, xdp_prog); + if (err) + goto out_redir; + break; + case XDP_TX: + generic_xdp_tx(skb, xdp_prog); + break; + } + return XDP_DROP; + } + } + return XDP_PASS; +out_redir: + kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + return XDP_DROP; +} +EXPORT_SYMBOL_GPL(do_xdp_generic); + +static int netif_rx_internal(struct sk_buff *skb) +{ + int ret; + + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + + trace_netif_rx(skb); + +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu < 0) + cpu = smp_processor_id(); + + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); + } else +#endif + { + unsigned int qtail; + + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); + } + return ret; +} + +/** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). + */ +int __netif_rx(struct sk_buff *skb) +{ + int ret; + + lockdep_assert_once(hardirq_count() | softirq_count()); + + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; +} +EXPORT_SYMBOL(__netif_rx); + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * driver should use NAPI and GRO. + * This function can used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ +int netif_rx(struct sk_buff *skb) +{ + bool need_bh_off = !(hardirq_count() | softirq_count()); + int ret; + + if (need_bh_off) + local_bh_disable(); + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + if (need_bh_off) + local_bh_enable(); + return ret; +} +EXPORT_SYMBOL(netif_rx); + +static __latent_entropy void net_tx_action(struct softirq_action *h) +{ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_disable(); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_enable(); + + while (clist) { + struct sk_buff *skb = clist; + + clist = clist->next; + + WARN_ON(refcount_read(&skb->users)); + if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, net_tx_action, + SKB_DROP_REASON_NOT_SPECIFIED); + + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); + } + } + + if (sd->output_queue) { + struct Qdisc *head; + + local_irq_disable(); + head = sd->output_queue; + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; + local_irq_enable(); + + rcu_read_lock(); + + while (head) { + struct Qdisc *q = head; + spinlock_t *root_lock = NULL; + + head = head->next_sched; + + /* We need to make sure head->next_sched is read + * before clearing __QDISC_STATE_SCHED + */ + smp_mb__before_atomic(); + + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, + &q->state))) { + /* There is a synchronize_net() between + * STATE_DEACTIVATED flag being set and + * qdisc_reset()/some_qdisc_is_busy() in + * dev_deactivate(), so we can safely bail out + * early here to avoid data race between + * qdisc_deactivate() and some_qdisc_is_busy() + * for lockless qdisc. + */ + clear_bit(__QDISC_STATE_SCHED, &q->state); + continue; + } + + clear_bit(__QDISC_STATE_SCHED, &q->state); + qdisc_run(q); + if (root_lock) + spin_unlock(root_lock); + } + + rcu_read_unlock(); + } + + xfrm_dev_backlog(sd); +} + +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) +/* This hook is defined here for ATM LANE */ +int (*br_fdb_test_addr_hook)(struct net_device *dev, + unsigned char *addr) __read_mostly; +EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); +#endif + +static inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ +#ifdef CONFIG_NET_CLS_ACT + struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); + struct tcf_result cl_res; + + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out here. + */ + if (!miniq) + return skb; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; + skb->tc_at_ingress = 1; + mini_qdisc_bstats_cpu_update(miniq, skb); + + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; + return NULL; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + *ret = NET_RX_SUCCESS; + return NULL; + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by cls/act_bpf, so + * we can safely push the L2 header back before + * redirecting to another netdev + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; + return NULL; + default: + break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return skb; +} + +/** + * netdev_is_rx_handler_busy - check if receive handler is registered + * @dev: device to check + * + * Check if a receive handler is already registered for a given device. + * Return true if there one. + * + * The caller must hold the rtnl_mutex. + */ +bool netdev_is_rx_handler_busy(struct net_device *dev) +{ + ASSERT_RTNL(); + return dev && rtnl_dereference(dev->rx_handler); +} +EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); + +/** + * netdev_rx_handler_register - register receive handler + * @dev: device to register a handler for + * @rx_handler: receive handler to register + * @rx_handler_data: data pointer that is used by rx handler + * + * Register a receive handler for a device. This handler will then be + * called from __netif_receive_skb. A negative errno code is returned + * on a failure. + * + * The caller must hold the rtnl_mutex. + * + * For a general description of rx_handler, see enum rx_handler_result. + */ +int netdev_rx_handler_register(struct net_device *dev, + rx_handler_func_t *rx_handler, + void *rx_handler_data) +{ + if (netdev_is_rx_handler_busy(dev)) + return -EBUSY; + + if (dev->priv_flags & IFF_NO_RX_HANDLER) + return -EINVAL; + + /* Note: rx_handler_data must be set before rx_handler */ + rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); + rcu_assign_pointer(dev->rx_handler, rx_handler); + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_register); + +/** + * netdev_rx_handler_unregister - unregister receive handler + * @dev: device to unregister a handler from + * + * Unregister a receive handler from a device. + * + * The caller must hold the rtnl_mutex. + */ +void netdev_rx_handler_unregister(struct net_device *dev) +{ + + ASSERT_RTNL(); + RCU_INIT_POINTER(dev->rx_handler, NULL); + /* a reader seeing a non NULL rx_handler in a rcu_read_lock() + * section has a guarantee to see a non NULL rx_handler_data + * as well. + */ + synchronize_net(); + RCU_INIT_POINTER(dev->rx_handler_data, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + +/* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs. + */ +static bool skb_pfmemalloc_protocol(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} + +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (nf_hook_ingress_active(skb)) { + int ingress_retval; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + rcu_read_lock(); + ingress_retval = nf_hook_ingress(skb); + rcu_read_unlock(); + return ingress_retval; + } + return 0; +} + +static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, + struct packet_type **ppt_prev) +{ + struct packet_type *ptype, *pt_prev; + rx_handler_func_t *rx_handler; + struct sk_buff *skb = *pskb; + struct net_device *orig_dev; + bool deliver_exact = false; + int ret = NET_RX_DROP; + __be16 type; + + net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); + + trace_netif_receive_skb(skb); + + orig_dev = skb->dev; + + skb_reset_network_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + pt_prev = NULL; + +another_round: + skb->skb_iif = skb->dev->ifindex; + + __this_cpu_inc(softnet_data.processed); + + if (static_branch_unlikely(&generic_xdp_needed_key)) { + int ret2; + + migrate_disable(); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + migrate_enable(); + + if (ret2 != XDP_PASS) { + ret = NET_RX_DROP; + goto out; + } + } + + if (eth_type_vlan(skb->protocol)) { + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + + if (skb_skip_tc_classify(skb)) + goto skip_classify; + + if (pfmemalloc) + goto skip_taps; + + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + +skip_taps: +#ifdef CONFIG_NET_INGRESS + if (static_branch_unlikely(&ingress_needed_key)) { + bool another = false; + + nf_skip_egress(skb, true); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; + if (!skb) + goto out; + + nf_skip_egress(skb, false); + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto out; + } +#endif + skb_reset_redirect(skb); +skip_classify: + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + goto drop; + + if (skb_vlan_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb)) + goto another_round; + else if (unlikely(!skb)) + goto out; + } + + rx_handler = rcu_dereference(skb->dev->rx_handler); + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: + ret = NET_RX_SUCCESS; + goto out; + case RX_HANDLER_ANOTHER: + goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + break; + case RX_HANDLER_PASS: + break; + default: + BUG(); + } + } + + if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { +check_vlan_id: + if (skb_vlan_tag_get_id(skb)) { + /* Vlan id is non 0 and vlan_do_receive() above couldn't + * find vlan device. + */ + skb->pkt_type = PACKET_OTHERHOST; + } else if (eth_type_vlan(skb->protocol)) { + /* Outer header is 802.1P with vlan 0, inner header is + * 802.1Q or 802.1AD and vlan_do_receive() above could + * not find vlan dev for vlan id 0. + */ + __vlan_hwaccel_clear_tag(skb); + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; + if (vlan_do_receive(&skb)) + /* After stripping off 802.1P header with vlan 0 + * vlan dev is found for inner header. + */ + goto another_round; + else if (unlikely(!skb)) + goto out; + else + /* We have stripped outer 802.1P vlan 0 header. + * But could not find vlan dev. + * check again for vlan id to set OTHERHOST. + */ + goto check_vlan_id; + } + /* Note: we might in the future use prio bits + * and set skb->priority like in vlan_do_receive() + * For the time being, just ignore Priority Code Point + */ + __vlan_hwaccel_clear_tag(skb); + } + + type = skb->protocol; + + /* deliver only exact match when indicated */ + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + } + + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); + } + + if (pt_prev) { + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) + goto drop; + *ppt_prev = pt_prev; + } else { +drop: + if (!deliver_exact) + dev_core_stats_rx_dropped_inc(skb->dev); + else + dev_core_stats_rx_nohandler_inc(skb->dev); + kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. :-) + */ + ret = NET_RX_DROP; + } + +out: + /* The invariant here is that if *ppt_prev is not NULL + * then skb should also be non-NULL. + * + * Apparently *ppt_prev assignment above holds this invariant due to + * skb dereferencing near it. + */ + *pskb = skb; + return ret; +} + +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, + skb->dev, pt_prev, orig_dev); + return ret; +} + +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if ``(page_is_)pfmemalloc``. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_one_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + struct sk_buff *skb, *next; + + if (!pt_prev) + return; + if (list_empty(head)) + return; + if (pt_prev->list_func != NULL) + INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, + ip_list_rcv, head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. + */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct list_head sublist; + struct sk_buff *skb, *next; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + skb_list_del_init(skb); + __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); + if (!pt_prev) + continue; + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + pt_curr = pt_prev; + od_curr = orig_dev; + } + list_add_tail(&skb->list, &sublist); + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); +} + +static int __netif_receive_skb(struct sk_buff *skb) +{ + int ret; + + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { + unsigned int noreclaim_flag; + + /* + * PFMEMALLOC skbs are special, they should + * - be delivered to SOCK_MEMALLOC sockets only + * - stay away from userspace + * - have bounded memory usage + * + * Use PF_MEMALLOC as this saves us from propagating the allocation + * context down to all allocation sites. + */ + noreclaim_flag = memalloc_noreclaim_save(); + ret = __netif_receive_skb_one_core(skb, true); + memalloc_noreclaim_restore(noreclaim_flag); + } else + ret = __netif_receive_skb_one_core(skb, false); + + return ret; +} + +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + +static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) +{ + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_branch_dec(&generic_xdp_needed_key); + } else if (new && !old) { + static_branch_inc(&generic_xdp_needed_key); + dev_disable_lro(dev); + dev_disable_gro_hw(dev); + } + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int netif_receive_skb_internal(struct sk_buff *skb) +{ + int ret; + + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + + if (skb_defer_rx_timestamp(skb)) + return NET_RX_SUCCESS; + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + return ret; + } + } +#endif + ret = __netif_receive_skb(skb); + rcu_read_unlock(); + return ret; +} + +void netif_receive_skb_list_internal(struct list_head *head) +{ + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + skb_list_del_init(skb); + if (!skb_defer_rx_timestamp(skb)) + list_add_tail(&skb->list, &sublist); + } + list_splice_init(&sublist, head); + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_branch_unlikely(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + /* Will be handled, remove from list */ + skb_list_del_init(skb); + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + int ret; + + trace_netif_receive_skb_entry(skb); + + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb); + +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb; + + if (list_empty(head)) + return; + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } + netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); +} +EXPORT_SYMBOL(netif_receive_skb_list); + +static DEFINE_PER_CPU(struct work_struct, flush_works); + +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(struct work_struct *work) +{ + struct sk_buff *skb, *tmp; + struct softnet_data *sd; + + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + + rps_lock_irq_disable(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { + __skb_unlink(skb, &sd->input_pkt_queue); + dev_kfree_skb_irq(skb); + input_queue_head_incr(sd); + } + } + rps_unlock_irq_enable(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { + __skb_unlink(skb, &sd->process_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } + local_bh_enable(); +} + +static bool flush_required(int cpu) +{ +#if IS_ENABLED(CONFIG_RPS) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; + + rps_lock_irq_disable(sd); + + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); + rps_unlock_irq_enable(sd); + + return do_flush; +#endif + /* without RPS we can't safely check input_pkt_queue: during a + * concurrent remote skb_queue_splice() we can detect as empty both + * input_pkt_queue and process_queue even if the latter could end-up + * containing a lot of packets. + */ + return true; +} + +static void flush_all_backlogs(void) +{ + static cpumask_t flush_cpus; + unsigned int cpu; + + /* since we are under rtnl lock protection we can use static data + * for the cpumask and avoid allocating on stack the possibly + * large mask + */ + ASSERT_RTNL(); + + cpus_read_lock(); + + cpumask_clear(&flush_cpus); + for_each_online_cpu(cpu) { + if (flush_required(cpu)) { + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&flush_works, cpu)); + cpumask_set_cpu(cpu, &flush_cpus); + } + } + + /* we can have in flight packet[s] on the cpus we are not flushing, + * synchronize_net() in unregister_netdevice_many() will take care of + * them + */ + for_each_cpu(cpu, &flush_cpus) + flush_work(per_cpu_ptr(&flush_works, cpu)); + + cpus_read_unlock(); +} + +static void net_rps_send_ipi(struct softnet_data *remsd) +{ +#ifdef CONFIG_RPS + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + smp_call_function_single_async(remsd->cpu, &remsd->csd); + remsd = next; + } +#endif +} + +/* + * net_rps_action_and_irq_enable sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + net_rps_send_ipi(remsd); + } else +#endif + local_irq_enable(); +} + +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + +static int process_backlog(struct napi_struct *napi, int quota) +{ + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + bool again = true; + int work = 0; + + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + + napi->weight = READ_ONCE(dev_rx_weight); + while (again) { + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sd->process_queue))) { + rcu_read_lock(); + __netif_receive_skb(skb); + rcu_read_unlock(); + input_queue_head_incr(sd); + if (++work >= quota) + return work; + + } + + rps_lock_irq_disable(sd); + if (skb_queue_empty(&sd->input_pkt_queue)) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set + * on backlog. + * We can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + napi->state = 0; + again = false; + } else { + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + } + rps_unlock_irq_enable(sd); + } + + return work; +} + +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. + */ +void __napi_schedule(struct napi_struct *n) +{ + unsigned long flags; + + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); + +/** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable to + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. + */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; + + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); + +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked. + * + * On PREEMPT_RT enabled kernels this maps to __napi_schedule() + * because the interrupt disabled assumption might not be true + * due to force-threaded interrupts and spinlock substitution. + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + else + __napi_schedule(n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + +bool napi_complete_done(struct napi_struct *n, int work_done) +{ + unsigned long flags, val, new, timeout = 0; + bool ret = true; + + /* + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. + */ + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) + return false; + + if (work_done) { + if (n->gro_bitmask) + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + if (timeout) + ret = false; + } + if (n->gro_bitmask) { + /* When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer + */ + napi_gro_flush(n, !!timeout); + } + + gro_normal_list(n); + + if (unlikely(!list_empty(&n->poll_list))) { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + list_del_init(&n->poll_list); + local_irq_restore(flags); + } + + do { + val = READ_ONCE(n->state); + + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); + + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_SCHED_THREADED | + NAPIF_STATE_PREFER_BUSY_POLL); + + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. + */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (cmpxchg(&n->state, val, new) != val); + + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; +} +EXPORT_SYMBOL(napi_complete_done); + +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +#if defined(CONFIG_NET_RX_BUSY_POLL) + +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) +{ + bool skip_schedule = false; + unsigned long timeout; + int rc; + + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, budget); + /* We can't gro_normal_list() here, because napi->poll() might have + * rearmed the napi (napi_complete_done()) in which case it could + * already be running on another CPU. + */ + trace_napi_poll(napi, rc, budget); + netpoll_poll_unlock(have_poll_lock); + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); + local_bh_enable(); +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); + void *have_poll_lock = NULL; + struct napi_struct *napi; + +restart: + napi_poll = NULL; + + rcu_read_lock(); + + napi = napi_by_id(napi_id); + if (!napi) + goto out; + + preempt_disable(); + for (;;) { + int work = 0; + + local_bh_disable(); + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + goto count; + } + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + goto count; + } + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; + } + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); + gro_normal_list(napi); +count: + if (work > 0) + __NET_ADD_STATS(dev_net(napi->dev), + LINUX_MIB_BUSYPOLLRXPACKETS, work); + local_bh_enable(); + + if (!loop_end || loop_end(loop_end_arg, start_time)) + break; + + if (unlikely(need_resched())) { + if (napi_poll) + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + if (loop_end(loop_end_arg, start_time)) + return; + goto restart; + } + cpu_relax(); + } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + preempt_enable(); +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(napi_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +static void napi_hash_add(struct napi_struct *napi) +{ + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) + return; + + spin_lock(&napi_hash_lock); + + /* 0..NR_CPUS range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + napi_gen_id = MIN_NAPI_ID; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); +} + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +static void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + hlist_del_init_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} + +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (!napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); + __napi_schedule_irqoff(napi); + } + + return HRTIMER_NORESTART; +} + +static void init_gro_hash(struct napi_struct *napi) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&napi->gro_hash[i].list); + napi->gro_hash[i].count = 0; + } + napi->gro_bitmask = 0; +} + +int dev_set_threaded(struct net_device *dev, bool threaded) +{ + struct napi_struct *napi; + int err = 0; + + if (dev->threaded == threaded) + return 0; + + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = false; + break; + } + } + } + } + + dev->threaded = threaded; + + /* Make sure kthread is created before THREADED bit + * is set. + */ + smp_mb__before_atomic(); + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } + + return err; +} +EXPORT_SYMBOL(dev_set_threaded); + +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + + INIT_LIST_HEAD(&napi->poll_list); + INIT_HLIST_NODE(&napi->napi_hash_node); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; + init_gro_hash(napi); + napi->skb = NULL; + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; + napi->poll = poll; + if (weight > NAPI_POLL_WEIGHT) + netdev_err_once(dev, "%s() called with weight %d\n", __func__, + weight); + napi->weight = weight; + napi->dev = dev; +#ifdef CONFIG_NETPOLL + napi->poll_owner = -1; +#endif + set_bit(NAPI_STATE_SCHED, &napi->state); + set_bit(NAPI_STATE_NPSVC, &napi->state); + list_add_rcu(&napi->dev_list, &dev->napi_list); + napi_hash_add(napi); + napi_get_frags_check(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). + */ + if (dev->threaded && napi_kthread_create(napi)) + dev->threaded = 0; +} +EXPORT_SYMBOL(netif_napi_add_weight); + +void napi_disable(struct napi_struct *n) +{ + unsigned long val, new; + + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + + for ( ; ; ) { + val = READ_ONCE(n->state); + if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + usleep_range(20, 200); + continue; + } + + new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; + new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + + if (cmpxchg(&n->state, val, new) == val) + break; + } + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + +/** + * napi_enable - enable NAPI scheduling + * @n: NAPI context + * + * Resume NAPI from being scheduled on this context. + * Must be paired with napi_disable. + */ +void napi_enable(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); + + new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); + if (n->dev->threaded && n->thread) + new |= NAPIF_STATE_THREADED; + } while (cmpxchg(&n->state, val, new) != val); +} +EXPORT_SYMBOL(napi_enable); + +static void flush_gro_hash(struct napi_struct *napi) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct sk_buff *skb, *n; + + list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) + kfree_skb(skb); + napi->gro_hash[i].count = 0; + } +} + +/* Must be called in process context */ +void __netif_napi_del(struct napi_struct *napi) +{ + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + + napi_hash_del(napi); + list_del_rcu(&napi->dev_list); + napi_free_frags(napi); + + flush_gro_hash(napi); + napi->gro_bitmask = 0; + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; + } +} +EXPORT_SYMBOL(__netif_napi_del); + +static int __napi_poll(struct napi_struct *n, bool *repoll) +{ + int work, weight; + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n, work, weight); + } + + if (unlikely(work > weight)) + netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); + + if (likely(work < weight)) + return work; + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(napi_disable_pending(n))) { + napi_complete(n); + return work; + } + + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + return work; + } + + if (n->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(n, HZ >= 1000); + } + + gro_normal_list(n); + + /* Some drivers may have called napi_schedule + * prior to exhausting their budget. + */ + if (unlikely(!list_empty(&n->poll_list))) { + pr_warn_once("%s: Budget exhausted after napi rescheduled\n", + n->dev ? n->dev->name : "backlog"); + return work; + } + + *repoll = true; + + return work; +} + +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + bool do_repoll = false; + void *have; + int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + work = __napi_poll(n, &do_repoll); + + if (do_repoll) + list_add_tail(&n->poll_list, repoll); + + netpoll_poll_unlock(have); + + return work; +} + +static int napi_thread_wait(struct napi_struct *napi) +{ + bool woken = false; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + /* Testing SCHED_THREADED bit here to make sure the current + * kthread owns this napi and could poll on this napi. + * Testing SCHED bit is not enough because SCHED bit might be + * set by some other busy poll thread or by napi_disable(). + */ + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); + /* woken being true indicates this thread owns this napi. */ + woken = true; + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return -1; +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + void *have; + + while (!napi_thread_wait(napi)) { + for (;;) { + bool repoll = false; + + local_bh_disable(); + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + local_bh_enable(); + + if (!repoll) + break; + + cond_resched(); + } + } + return 0; +} + +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + +static __latent_entropy void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); + int budget = READ_ONCE(netdev_budget); + LIST_HEAD(list); + LIST_HEAD(repoll); + + local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); + + for (;;) { + struct napi_struct *n; + + skb_defer_free_flush(sd); + + if (list_empty(&list)) { + if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) + goto end; + break; + } + + n = list_first_entry(&list, struct napi_struct, poll_list); + budget -= napi_poll(n, &repoll); + + /* If softirq window is exhausted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || + time_after_eq(jiffies, time_limit))) { + sd->time_squeeze++; + break; + } + } + + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + + net_rps_action_and_irq_enable(sd); +end:; +} + +struct netdev_adjacent { + struct net_device *dev; + netdevice_tracker dev_tracker; + + /* upper master flag, there can only be one master device per list */ + bool master; + + /* lookup ignore flag */ + bool ignore; + + /* counter for the number of times this device was added to us */ + u16 ref_nr; + + /* private field for the users */ + void *private; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, + struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; + } + return NULL; +} + +static int ____netdev_has_upper_dev(struct net_device *upper_dev, + struct netdev_nested_priv *priv) +{ + struct net_device *dev = (struct net_device *)priv->data; + + return upper_dev == dev; +} + +/** + * netdev_has_upper_dev - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks only immediate upper device, + * not through a complete stack of devices. The caller must hold the RTNL lock. + */ +bool netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + + ASSERT_RTNL(); + + return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, + &priv); +} +EXPORT_SYMBOL(netdev_has_upper_dev); + +/** + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks the entire upper device chain. + * The caller must hold rcu lock. + */ + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, + &priv); +} +EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); + +/** + * netdev_has_any_upper_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to an upper device and return true in case + * it is. The caller must hold the RTNL lock. + */ +bool netdev_has_any_upper_dev(struct net_device *dev) +{ + ASSERT_RTNL(); + + return !list_empty(&dev->adj_list.upper); +} +EXPORT_SYMBOL(netdev_has_any_upper_dev); + +/** + * netdev_master_upper_dev_get - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RTNL lock. + */ +struct net_device *netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master)) + return upper->dev; + return NULL; +} +EXPORT_SYMBOL(netdev_master_upper_dev_get); + +static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + ASSERT_RTNL(); + + if (list_empty(&dev->adj_list.upper)) + return NULL; + + upper = list_first_entry(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (likely(upper->master) && !upper->ignore) + return upper->dev; + return NULL; +} + +/** + * netdev_has_any_lower_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to a lower device and return true in case + * it is. The caller must hold the RTNL lock. + */ +static bool netdev_has_any_lower_dev(struct net_device *dev) +{ + ASSERT_RTNL(); + + return !list_empty(&dev->adj_list.lower); +} + +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_upper_get_next_dev_rcu - Get the next dev from upper list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next device from the dev's upper list, starting from iter + * position. The caller must hold RCU read lock. + */ +struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} +EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); + +static struct net_device *__netdev_next_upper_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *upper; + + upper = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + *ignore = upper->ignore; + + return upper->dev; +} + +static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *upper; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); + + upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&upper->list == &dev->adj_list.upper) + return NULL; + + *iter = &upper->list; + + return upper->dev; +} + +static int __netdev_walk_all_upper_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = __netdev_next_upper_dev(now, &iter, &ignore); + if (!udev) + break; + if (ignore) + continue; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.upper; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + udev = netdev_next_upper_dev_rcu(now, &iter); + if (!udev) + break; + + next = udev; + niter = &udev->adj_list.upper; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); + +static bool __netdev_has_upper_dev(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = (void *)upper_dev, + }; + + ASSERT_RTNL(); + + return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, + &priv); +} + +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); + +/** + * netdev_lower_get_next - Get the next device from the lower neighbour + * list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent from the dev's lower neighbour + * list, starting from iter position. The caller must hold RTNL lock or + * its own locking that guarantees that the neighbour lower + * list will remain unchanged. + */ +void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = lower->list.next; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_lower_get_next); + +static struct net_device *netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} + +static struct net_device *__netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter, + bool *ignore) +{ + struct netdev_adjacent *lower; + + lower = list_entry((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + *ignore = lower->ignore; + + return lower->dev; +} + +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); + +static int __netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + bool ignore; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = __netdev_next_lower_dev(now, &iter, &ignore); + if (!ldev) + break; + if (ignore) + continue; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} + +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); + +static u8 __netdev_upper_depth(struct net_device *dev) +{ + struct net_device *udev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.upper, + udev = __netdev_next_upper_dev(dev, &iter, &ignore); + udev; + udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < udev->upper_level) + max_depth = udev->upper_level; + } + + return max_depth; +} + +static u8 __netdev_lower_depth(struct net_device *dev) +{ + struct net_device *ldev; + struct list_head *iter; + u8 max_depth = 0; + bool ignore; + + for (iter = &dev->adj_list.lower, + ldev = __netdev_next_lower_dev(dev, &iter, &ignore); + ldev; + ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { + if (ignore) + continue; + if (max_depth < ldev->lower_level) + max_depth = ldev->lower_level; + } + + return max_depth; +} + +static int __netdev_update_upper_level(struct net_device *dev, + struct netdev_nested_priv *__unused) +{ + dev->upper_level = __netdev_upper_depth(dev) + 1; + return 0; +} + +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + +static int __netdev_update_lower_level(struct net_device *dev, + struct netdev_nested_priv *priv) +{ + dev->lower_level = __netdev_lower_depth(dev) + 1; + +#ifdef CONFIG_LOCKDEP + if (!priv) + return 0; + + if (priv->flags & NESTED_SYNC_IMM) + dev->nested_level = dev->lower_level - 1; + if (priv->flags & NESTED_SYNC_TODO) + net_unlink_todo(dev); +#endif + return 0; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int ret, cur = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + if (now != dev) { + ret = fn(now, priv); + if (ret) + return ret; + } + + next = NULL; + while (1) { + ldev = netdev_next_lower_dev_rcu(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + break; + } + + if (!next) { + if (!cur) + return 0; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); + +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + +/** + * netdev_master_upper_dev_get_rcu - Get master upper device + * @dev: device + * + * Find a master upper device and return pointer to it or NULL in case + * it's not there. The caller must hold the RCU read lock. + */ +struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) +{ + struct netdev_adjacent *upper; + + upper = list_first_or_null_rcu(&dev->adj_list.upper, + struct netdev_adjacent, list); + if (upper && likely(upper->master)) + return upper->dev; + return NULL; +} +EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); + +static int netdev_adjacent_sysfs_add(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", adj_dev->name); + return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), + linkname); +} +static void netdev_adjacent_sysfs_del(struct net_device *dev, + char *name, + struct list_head *dev_list) +{ + char linkname[IFNAMSIZ+7]; + + sprintf(linkname, dev_list == &dev->adj_list.upper ? + "upper_%s" : "lower_%s", name); + sysfs_remove_link(&(dev->dev.kobj), linkname); +} + +static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) +{ + return (dev_list == &dev->adj_list.upper || + dev_list == &dev->adj_list.lower) && + net_eq(dev_net(dev), dev_net(adj_dev)); +} + +static int __netdev_adjacent_dev_insert(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list, + void *private, bool master) +{ + struct netdev_adjacent *adj; + int ret; + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (adj) { + adj->ref_nr += 1; + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", + dev->name, adj_dev->name, adj->ref_nr); + + return 0; + } + + adj = kmalloc(sizeof(*adj), GFP_KERNEL); + if (!adj) + return -ENOMEM; + + adj->dev = adj_dev; + adj->master = master; + adj->ref_nr = 1; + adj->private = private; + adj->ignore = false; + netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); + + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", + dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { + ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); + if (ret) + goto free_adj; + } + + /* Ensure that master link is always the first item in list. */ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } + + return 0; + +remove_symlinks: + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); +free_adj: + netdev_put(adj_dev, &adj->dev_tracker); + kfree(adj); + + return ret; +} + +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + u16 ref_nr, + struct list_head *dev_list) +{ + struct netdev_adjacent *adj; + + pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", + dev->name, adj_dev->name, ref_nr); + + adj = __netdev_find_adj(adj_dev, dev_list); + + if (!adj) { + pr_err("Adjacency does not exist for device %s from %s\n", + dev->name, adj_dev->name); + WARN_ON(1); + return; + } + + if (adj->ref_nr > ref_nr) { + pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", + dev->name, adj_dev->name, ref_nr, + adj->ref_nr - ref_nr); + adj->ref_nr -= ref_nr; + return; + } + + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + + if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) + netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); + + list_del_rcu(&adj->list); + pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + netdev_put(adj_dev, &adj->dev_tracker); + kfree_rcu(adj, rcu); +} + +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) +{ + int ret; + + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, + private, master); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, + private, false); + if (ret) { + __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); + return ret; + } + + return 0; +} + +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + u16 ref_nr, + struct list_head *up_list, + struct list_head *down_list) +{ + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); +} + +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) +{ + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); +} + +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} + +static int __netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, bool master, + void *upper_priv, void *upper_info, + struct netdev_nested_priv *priv, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; + struct net_device *master_dev; + int ret = 0; + + ASSERT_RTNL(); + + if (dev == upper_dev) + return -EBUSY; + + /* To prevent loops, check if dev is not upper device to upper_dev. */ + if (__netdev_has_upper_dev(upper_dev, dev)) + return -EBUSY; + + if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) + return -EMLINK; + + if (!master) { + if (__netdev_has_upper_dev(dev, upper_dev)) + return -EEXIST; + } else { + master_dev = __netdev_master_upper_dev_get(dev); + if (master_dev) + return master_dev == upper_dev ? -EEXIST : -EBUSY; + } + + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, + master); + if (ret) + return ret; + + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback; + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, priv); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + priv); + + return 0; + +rollback: + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + return ret; +} + +/** + * netdev_upper_dev_link - Add a link to the upper device + * @dev: device + * @upper_dev: new upper device + * @extack: netlink extended ack + * + * Adds a link to device which is upper to this one. The caller must hold + * the RTNL lock. On a failure a negative errno code is returned. + * On success the reference counts are adjusted and the function + * returns zero. + */ +int netdev_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, + struct netlink_ext_ack *extack) +{ + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, &priv, extack); +} +EXPORT_SYMBOL(netdev_upper_dev_link); + +/** + * netdev_master_upper_dev_link - Add a master link to the upper device + * @dev: device + * @upper_dev: new upper device + * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier + * @extack: netlink extended ack + * + * Adds a link to device which is upper to this one. In this case, only + * one master upper device can be linked, although other non-master devices + * might be linked as well. The caller must hold the RTNL lock. + * On a failure a negative errno code is returned. On success the reference + * counts are adjusted and the function returns zero. + */ +int netdev_master_upper_dev_link(struct net_device *dev, + struct net_device *upper_dev, + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) +{ + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info, &priv, extack); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link); + +static void __netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev, + struct netdev_nested_priv *priv) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; + + ASSERT_RTNL(); + + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, + &changeupper_info.info); + + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); + + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, + &changeupper_info.info); + + __netdev_update_upper_level(dev, NULL); + __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); + + __netdev_update_lower_level(upper_dev, priv); + __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, + priv); +} + +/** + * netdev_upper_dev_unlink - Removes a link to upper device + * @dev: device + * @upper_dev: new upper device + * + * Removes a link to device which is upper to this one. The caller must hold + * the RTNL lock. + */ +void netdev_upper_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) +{ + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_TODO, + .data = NULL, + }; + + __netdev_upper_dev_unlink(dev, upper_dev, &priv); +} +EXPORT_SYMBOL(netdev_upper_dev_unlink); + +static void __netdev_adjacent_dev_set(struct net_device *upper_dev, + struct net_device *lower_dev, + bool val) +{ + struct netdev_adjacent *adj; + + adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); + if (adj) + adj->ignore = val; + + adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); + if (adj) + adj->ignore = val; +} + +static void netdev_adjacent_dev_disable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, true); +} + +static void netdev_adjacent_dev_enable(struct net_device *upper_dev, + struct net_device *lower_dev) +{ + __netdev_adjacent_dev_set(upper_dev, lower_dev, false); +} + +int netdev_adjacent_change_prepare(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + int err; + + if (!new_dev) + return 0; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_disable(dev, old_dev); + err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, + extack); + if (err) { + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL(netdev_adjacent_change_prepare); + +void netdev_adjacent_change_commit(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + + if (!new_dev || !old_dev) + return; + + if (new_dev == old_dev) + return; + + netdev_adjacent_dev_enable(dev, old_dev); + __netdev_upper_dev_unlink(old_dev, dev, &priv); +} +EXPORT_SYMBOL(netdev_adjacent_change_commit); + +void netdev_adjacent_change_abort(struct net_device *old_dev, + struct net_device *new_dev, + struct net_device *dev) +{ + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + + if (!new_dev) + return; + + if (old_dev && new_dev != old_dev) + netdev_adjacent_dev_enable(dev, old_dev); + + __netdev_upper_dev_unlink(new_dev, dev, &priv); +} +EXPORT_SYMBOL(netdev_adjacent_change_abort); + +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. + */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; + + ASSERT_RTNL(); + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return; + + netdev_hw_stats64_add(stats, p_stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @dev: device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. + */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + +static void netdev_adjacent_add_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(dev, iter->dev, + &dev->adj_list.lower); + } +} + +static void netdev_adjacent_del_links(struct net_device *dev) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.upper); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, dev->name, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_del(dev, iter->dev->name, + &dev->adj_list.lower); + } +} + +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + struct net *net = dev_net(dev); + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + if (!net_eq(net, dev_net(iter->dev))) + continue; + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + + +/** + * netdev_lower_state_changed - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); + +static void dev_change_rx_flags(struct net_device *dev, int flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_rx_flags) + ops->ndo_change_rx_flags(dev, flags); +} + +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) +{ + unsigned int old_flags = dev->flags; + kuid_t uid; + kgid_t gid; + + ASSERT_RTNL(); + + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); + return -EOVERFLOW; + } + } + if (dev->flags != old_flags) { + pr_info("device %s %s promiscuous mode\n", + dev->name, + dev->flags & IFF_PROMISC ? "entered" : "left"); + if (audit_enabled) { + current_uid_gid(&uid, &gid); + audit_log(audit_context(), GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid), + audit_get_sessionid(current)); + } + + dev_change_rx_flags(dev, IFF_PROMISC); + } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); + return 0; +} + +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned int old_flags = dev->flags; + int err; + + err = __dev_set_promiscuity(dev, inc, true); + if (err < 0) + return err; + if (dev->flags != old_flags) + dev_set_rx_mode(dev); + return err; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) +{ + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + + ASSERT_RTNL(); + + dev->flags |= IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); + return -EOVERFLOW; + } + } + if (dev->flags ^ old_flags) { + dev_change_rx_flags(dev, IFF_ALLMULTI); + dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); + } + return 0; +} + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + return __dev_set_allmulti(dev, inc, true); +} +EXPORT_SYMBOL(dev_set_allmulti); + +/* + * Upload unicast and multicast address lists to device and + * configure RX filtering. When the device doesn't support unicast + * filtering it is put in promiscuous mode while unicast addresses + * are present. + */ +void __dev_set_rx_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* dev_open will call this function so the list will stay sane. */ + if (!(dev->flags&IFF_UP)) + return; + + if (!netif_device_present(dev)) + return; + + if (!(dev->priv_flags & IFF_UNICAST_FLT)) { + /* Unicast addresses changes may only happen under the rtnl, + * therefore calling __dev_set_promiscuity here is safe. + */ + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { + __dev_set_promiscuity(dev, 1, false); + dev->uc_promisc = true; + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { + __dev_set_promiscuity(dev, -1, false); + dev->uc_promisc = false; + } + } + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); +} + +void dev_set_rx_mode(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); +} + +/** + * dev_get_flags - get flags reported to userspace + * @dev: device + * + * Get the combination of flag bits exported through APIs to userspace. + */ +unsigned int dev_get_flags(const struct net_device *dev) +{ + unsigned int flags; + + flags = (dev->flags & ~(IFF_PROMISC | + IFF_ALLMULTI | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | + (dev->gflags & (IFF_PROMISC | + IFF_ALLMULTI)); + + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } + + return flags; +} +EXPORT_SYMBOL(dev_get_flags); + +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) +{ + unsigned int old_flags = dev->flags; + int ret; + + ASSERT_RTNL(); + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | + IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + if ((old_flags ^ flags) & IFF_MULTICAST) + dev_change_rx_flags(dev, IFF_MULTICAST); + + dev_set_rx_mode(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev, extack); + } + + if ((flags ^ dev->gflags) & IFF_PROMISC) { + int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; + + dev->gflags ^= IFF_PROMISC; + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + * is important. Some (broken) drivers set IFF_PROMISC, when + * IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags ^ dev->gflags) & IFF_ALLMULTI) { + int inc = (flags & IFF_ALLMULTI) ? 1 : -1; + + dev->gflags ^= IFF_ALLMULTI; + __dev_set_allmulti(dev, inc, false); + } + + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; + + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); + } +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * @extack: netlink extended ack + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) +{ + int ret; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; + + ret = __dev_change_flags(dev, flags, extack); + if (ret < 0) + return ret; + + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes); + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +int __dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_change_mtu) + return ops->ndo_change_mtu(dev, new_mtu); + + /* Pairs with all the lockless reads of dev->mtu in the stack */ + WRITE_ONCE(dev->mtu, new_mtu); + return 0; +} +EXPORT_SYMBOL(__dev_set_mtu); + +int dev_validate_mtu(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + NL_SET_ERR_MSG(extack, "mtu less than device minimum"); + return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); + return -EINVAL; + } + return 0; +} + +/** + * dev_set_mtu_ext - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * @extack: netlink extended ack + * + * Change the maximum transfer size of the network device. + */ +int dev_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) +{ + int err, orig_mtu; + + if (new_mtu == dev->mtu) + return 0; + + err = dev_validate_mtu(dev, new_mtu, extack); + if (err) + return err; + + if (!netif_device_present(dev)) + return -ENODEV; + + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; + + orig_mtu = dev->mtu; + err = __dev_set_mtu(dev, new_mtu); + + if (!err) { + err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + orig_mtu); + err = notifier_to_errno(err); + if (err) { + /* setting mtu back and notifying everyone again, + * so that they have a chance to revert changes. + */ + __dev_set_mtu(dev, orig_mtu); + call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, + new_mtu); + } + } + return err; +} + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + struct netlink_ext_ack extack; + int err; + + memset(&extack, 0, sizeof(extack)); + err = dev_set_mtu_ext(dev, new_mtu, &extack); + if (err && extack._msg) + net_err_ratelimited("%s: %s\n", dev->name, extack._msg); + return err; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_change_tx_queue_len - Change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; + + if (new_len != orig_len) { + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) + goto err_rollback; + res = dev_qdisc_change_tx_queue_len(dev); + if (res) + goto err_rollback; + } + + return 0; + +err_rollback: + netdev_err(dev, "refused to change device tx_queue_len\n"); + dev->tx_queue_len = orig_len; + return res; +} + +/** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} + +/** + * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + */ +int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_pre_changeaddr_info info = { + .info.dev = dev, + .info.extack = extack, + .dev_addr = addr, + }; + int rc; + + rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); + return notifier_to_errno(rc); +} +EXPORT_SYMBOL(dev_pre_changeaddr_notify); + +/** + * dev_set_mac_address - Change Media Access Control Address + * @dev: device + * @sa: new address + * @extack: netlink extended ack + * + * Change the hardware (MAC) address of the device + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (!ops->ndo_set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + if (err) + return err; + err = ops->ndo_set_mac_address(dev, sa); + if (err) + return err; + dev->addr_assign_type = NET_ADDR_SET; + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + add_device_randomness(dev->dev_addr, dev->addr_len); + return 0; +} +EXPORT_SYMBOL(dev_set_mac_address); + +static DECLARE_RWSEM(dev_addr_sem); + +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + ret = dev_set_mac_address(dev, sa, extack); + up_write(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_get_mac_address); + +/** + * dev_change_carrier - Change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier + */ +int dev_change_carrier(struct net_device *dev, bool new_carrier) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_carrier) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_carrier(dev, new_carrier); +} + +/** + * dev_get_phys_port_id - Get device physical port ID + * @dev: device + * @ppid: port ID + * + * Get device physical port ID + */ +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + return ops->ndo_get_phys_port_id(dev, ppid); +} + +/** + * dev_get_phys_port_name - Get device physical port name + * @dev: device + * @name: port name + * @len: limit of bytes to copy to name + * + * Get device physical port name + */ +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (ops->ndo_get_phys_port_name) { + err = ops->ndo_get_phys_port_name(dev, name, len); + if (err != -EOPNOTSUPP) + return err; + } + return devlink_compat_phys_port_name_get(dev, name, len); +} + +/** + * dev_get_port_parent_id - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier + */ +int dev_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, + bool recurse) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct netdev_phys_item_id first = { }; + struct net_device *lower_dev; + struct list_head *iter; + int err; + + if (ops->ndo_get_port_parent_id) { + err = ops->ndo_get_port_parent_id(dev, ppid); + if (err != -EOPNOTSUPP) + return err; + } + + err = devlink_compat_switch_id_get(dev, ppid); + if (!recurse || err != -EOPNOTSUPP) + return err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + err = dev_get_port_parent_id(lower_dev, ppid, true); + if (err) + break; + if (!first.id_len) + first = *ppid; + else if (memcmp(&first, ppid, sizeof(*ppid))) + return -EOPNOTSUPP; + } + + return err; +} +EXPORT_SYMBOL(dev_get_port_parent_id); + +/** + * netdev_port_same_parent_id - Indicate if two network devices have + * the same port parent identifier + * @a: first network device + * @b: second network device + */ +bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) +{ + struct netdev_phys_item_id a_id = { }; + struct netdev_phys_item_id b_id = { }; + + if (dev_get_port_parent_id(a, &a_id, true) || + dev_get_port_parent_id(b, &b_id, true)) + return false; + + return netdev_phys_item_id_same(&a_id, &b_id); +} +EXPORT_SYMBOL(netdev_port_same_parent_id); + +/** + * dev_change_proto_down - set carrier according to proto_down. + * + * @dev: device + * @proto_down: new value + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + if (proto_down) + netif_carrier_off(dev); + else + netif_carrier_on(dev); + dev->proto_down = proto_down; + return 0; +} + +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) +{ + int b; + + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} + +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + +static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) +{ + if (flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + if (flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_DRV; + if (flags & XDP_FLAGS_SKB_MODE) + return XDP_MODE_SKB; + return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; +} + +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) +{ + switch (mode) { + case XDP_MODE_SKB: + return generic_xdp_install; + case XDP_MODE_DRV: + case XDP_MODE_HW: + return dev->netdev_ops->ndo_bpf; + default: + return NULL; + } +} + +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; + return dev->xdp_state[mode].prog; +} + +u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} +EXPORT_SYMBOL_GPL(dev_xdp_prog_count); + +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) +{ + struct bpf_prog *prog = dev_xdp_prog(dev, mode); + + return prog ? prog->aux->id : 0; +} + +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + +static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_prog *prog) +{ + dev->xdp_state[mode].link = NULL; + dev->xdp_state[mode].prog = prog; +} + +static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, + bpf_op_t bpf_op, struct netlink_ext_ack *extack, + u32 flags, struct bpf_prog *prog) +{ + struct netdev_bpf xdp; + int err; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; + xdp.extack = extack; + xdp.flags = flags; + xdp.prog = prog; + + /* Drivers assume refcnt is already incremented (i.e, prog pointer is + * "moved" into driver), so they don't increment it on their own, but + * they do decrement refcnt when program is detached or replaced. + * Given net_device also owns link/prog, we need to bump refcnt here + * to prevent drivers from underflowing it. + */ + if (prog) + bpf_prog_inc(prog); + err = bpf_op(dev, &xdp); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } + + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); + + return 0; +} + +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct bpf_xdp_link *link; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; + + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) + continue; + + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); + } +} + +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) +{ + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); + struct bpf_prog *cur_prog; + struct net_device *upper; + struct list_head *iter; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err; + + ASSERT_RTNL(); + + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { + NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); + return -EINVAL; + } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } + /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ + if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { + NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); + return -EINVAL; + } + + mode = dev_xdp_mode(dev, flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + + /* don't allow if an upper device already has a program */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (dev_xdp_prog_count(upper) > 0) { + NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); + return -EEXIST; + } + } + + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } + if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + + if (new_prog) { + bool offload = mode == XDP_MODE_HW; + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? XDP_MODE_DRV : XDP_MODE_SKB; + + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; + } + if (!offload && dev_xdp_prog(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); + return -EEXIST; + } + if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); + return -EINVAL; + } + } + + /* don't call drivers if the effective program didn't change */ + if (new_prog != cur_prog) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { + NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); + return -EOPNOTSUPP; + } + + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); + if (err) + return err; + } + + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); + if (cur_prog) + bpf_prog_put(cur_prog); + + return 0; +} + +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(dev, link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) { + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } + + rtnl_unlock(); +} + +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + +static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err = 0; + + rtnl_lock(); + + /* link might have been auto-released already, so fail */ + if (!xdp_link->dev) { + err = -ENOLINK; + goto out_unlock; + } + + if (old_prog && link->prog != old_prog) { + err = -EPERM; + goto out_unlock; + } + old_prog = link->prog; + if (old_prog->type != new_prog->type || + old_prog->expected_attach_type != new_prog->expected_attach_type) { + err = -EINVAL; + goto out_unlock; + } + + if (old_prog == new_prog) { + /* no-op, don't disturb drivers */ + bpf_prog_put(new_prog); + goto out_unlock; + } + + mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); + bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); + err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, + xdp_link->flags, new_prog); + if (err) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + rtnl_unlock(); + return err; +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, + .update_prog = bpf_xdp_link_update, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + rtnl_lock(); + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) { + rtnl_unlock(); + return -EINVAL; + } + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto unlock; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto unlock; + } + + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + link->dev = NULL; + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +unlock: + rtnl_unlock(); + +out_put_dev: + dev_put(dev); + return err; +} + +/** + * dev_change_xdp_fd - set or clear a bpf program for a device rx path + * @dev: device + * @extack: netlink extended ack + * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear + * @flags: xdp-related flags + * + * Set or clear a bpf program for a device + */ +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags) +{ + enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); + struct bpf_prog *new_prog = NULL, *old_prog = NULL; + int err; + + ASSERT_RTNL(); + + if (fd >= 0) { + new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + } + + if (expected_fd >= 0) { + old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(old_prog)) { + err = PTR_ERR(old_prog); + old_prog = NULL; + goto err_out; + } + } + + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); + +err_out: + if (err && new_prog) + bpf_prog_put(new_prog); + if (old_prog) + bpf_prog_put(old_prog); + return err; +} + +/** + * dev_new_index - allocate an ifindex + * @net: the applicable net namespace + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(struct net *net) +{ + int ifindex = net->ifindex; + + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(net, ifindex)) + return net->ifindex = ifindex; + } +} + +/* Delayed registration/unregisteration */ +LIST_HEAD(net_todo_list); +DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); + +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); + atomic_inc(&dev_net(dev)->dev_unreg_count); +} + +static netdev_features_t netdev_sync_upper_features(struct net_device *lower, + struct net_device *upper, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(upper->wanted_features & feature) + && (features & feature)) { + netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", + &feature, upper->name); + features &= ~feature; + } + } + + return features; +} + +static void netdev_sync_lower_features(struct net_device *upper, + struct net_device *lower, netdev_features_t features) +{ + netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; + netdev_features_t feature; + int feature_bit; + + for_each_netdev_feature(upper_disables, feature_bit) { + feature = __NETIF_F_BIT(feature_bit); + if (!(features & feature) && (lower->features & feature)) { + netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", + &feature, lower->name); + lower->wanted_features &= ~feature; + __netdev_update_features(lower); + + if (unlikely(lower->features & feature)) + netdev_WARN(upper, "failed to disable %pNF on %s!\n", + &feature, lower->name); + else + netdev_features_change(lower); + } + } +} + +static netdev_features_t netdev_fix_features(struct net_device *dev, + netdev_features_t features) +{ + /* Fix illegal checksum combinations */ + if ((features & NETIF_F_HW_CSUM) && + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed HW and IP checksum settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } + + /* TSO requires that SG is present as well. */ + if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); + features &= ~NETIF_F_ALL_TSO; + } + + if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IP_CSUM)) { + netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO; + features &= ~NETIF_F_TSO_ECN; + } + + if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IPV6_CSUM)) { + netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO6; + } + + /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ + if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) + features &= ~NETIF_F_TSO_MANGLEID; + + /* TSO ECN requires that TSO is present as well. */ + if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) + features &= ~NETIF_F_TSO_ECN; + + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; + } + + /* GSO partial features require GSO partial be set */ + if ((features & dev->gso_partial_features) && + !(features & NETIF_F_GSO_PARTIAL)) { + netdev_dbg(dev, + "Dropping partially supported GSO features since no GSO partial.\n"); + features &= ~dev->gso_partial_features; + } + + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. + */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + /* LRO/HW-GRO features cannot be combined with RX-FCS */ + if (features & NETIF_F_RXFCS) { + if (features & NETIF_F_LRO) { + netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_LRO; + } + + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { + netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); + features &= ~NETIF_F_LRO; + } + + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + + return features; +} + +int __netdev_update_features(struct net_device *dev) +{ + struct net_device *upper, *lower; + netdev_features_t features; + struct list_head *iter; + int err = -1; + + ASSERT_RTNL(); + + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + /* some features can't be enabled if they're off on an upper device */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) + features = netdev_sync_upper_features(dev, upper, features); + + if (dev->features == features) + goto sync_lower; + + netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", + &dev->features, &features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; + + if (unlikely(err < 0)) { + netdev_err(dev, + "set_features() failed (%d); wanted %pNF, left %pNF\n", + err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ + return -1; + } + +sync_lower: + /* some features must be disabled on lower devices when disabled + * on an upper device (think: bonding master or bridge) + */ + netdev_for_each_lower_dev(dev, lower, iter) + netdev_sync_lower_features(dev, lower, features); + + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + } + + if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_ctag_filter_info(dev); + } else { + vlan_drop_rx_ctag_filter_info(dev); + } + } + + if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { + if (features & NETIF_F_HW_VLAN_STAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_stag_filter_info(dev); + } else { + vlan_drop_rx_stag_filter_info(dev); + } + } + + dev->features = features; + } + + return err < 0 ? 0 : 1; +} + +/** + * netdev_update_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications if it + * has changed. Should be called after driver or hardware dependent + * conditions might have changed that influence the features. + */ +void netdev_update_features(struct net_device *dev) +{ + if (__netdev_update_features(dev)) + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_update_features); + +/** + * netdev_change_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications even + * if they have not changed. Should be called instead of + * netdev_update_features() if also dev->vlan_features might + * have changed to allow the changes to be propagated to stacked + * VLAN devices. + */ +void netdev_change_features(struct net_device *dev) +{ + __netdev_update_features(dev); + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features); + +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + + if (netif_carrier_ok(rootdev)) + netif_carrier_on(dev); + else + netif_carrier_off(dev); +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + +static int netif_alloc_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + size_t sz = count * sizeof(*rx); + int err = 0; + + BUG_ON(count < 1); + + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!rx) + return -ENOMEM; + + dev->_rx = rx; + + for (i = 0; i < count; i++) { + rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); + if (err < 0) + goto err_rxq_info; + } + return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kvfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); +} + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) +{ + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +#ifdef CONFIG_BQL + dql_init(&queue->dql, HZ); +#endif +} + +static void netif_free_tx_queues(struct net_device *dev) +{ + kvfree(dev->_tx); +} + +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); + + if (count < 1 || count > 0xffff) + return -EINVAL; + + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!tx) + return -ENOMEM; + + dev->_tx = tx; + + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); + + return 0; +} + +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + +/** + * register_netdevice() - register a network device + * @dev: device to register + * + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. + */ +int register_netdevice(struct net_device *dev) +{ + int ret; + struct net *net = dev_net(dev); + + BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < + NETDEV_FEATURE_COUNT); + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + might_sleep(); + + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + BUG_ON(!net); + + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + + spin_lock_init(&dev->addr_list_lock); + netdev_set_addr_lockdep_class(dev); + + ret = dev_get_valid_name(net, dev, dev->name); + if (ret < 0) + goto out; + + ret = -ENOMEM; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + + /* Init, if this function is available */ + if (dev->netdev_ops->ndo_init) { + ret = dev->netdev_ops->ndo_init(dev); + if (ret) { + if (ret > 0) + ret = -EIO; + goto err_free_name; + } + } + + if (((dev->hw_features | dev->features) & + NETIF_F_HW_VLAN_CTAG_FILTER) && + (!dev->netdev_ops->ndo_vlan_rx_add_vid || + !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { + netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); + ret = -EINVAL; + goto err_uninit; + } + + ret = -EBUSY; + if (!dev->ifindex) + dev->ifindex = dev_new_index(net); + else if (__dev_get_by_index(net, dev->ifindex)) + goto err_uninit; + + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). + */ + dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); + dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->udp_tunnel_nic_info) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + + dev->wanted_features = dev->features & dev->hw_features; + + if (!(dev->flags & IFF_LOOPBACK)) + dev->hw_features |= NETIF_F_NOCACHE_COPY; + + /* If IPv4 TCP segmentation offload is supported we should also + * allow the device to enable segmenting the frame with the option + * of ignoring a static IP ID value. This doesn't enable the + * feature itself but allows the user to enable it later. + */ + if (dev->hw_features & NETIF_F_TSO) + dev->hw_features |= NETIF_F_TSO_MANGLEID; + if (dev->vlan_features & NETIF_F_TSO) + dev->vlan_features |= NETIF_F_TSO_MANGLEID; + if (dev->mpls_features & NETIF_F_TSO) + dev->mpls_features |= NETIF_F_TSO_MANGLEID; + if (dev->hw_enc_features & NETIF_F_TSO) + dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; + + /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. + */ + dev->vlan_features |= NETIF_F_HIGHDMA; + + /* Make NETIF_F_SG inheritable to tunnel devices. + */ + dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; + + /* Make NETIF_F_SG inheritable to MPLS. + */ + dev->mpls_features |= NETIF_F_SG; + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + + ret = netdev_register_kobject(dev); + write_lock(&dev_base_lock); + dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + write_unlock(&dev_base_lock); + if (ret) + goto err_uninit; + + __netdev_update_features(dev); + + /* + * Default initial state at registry is that the + * device is present. + */ + + set_bit(__LINK_STATE_PRESENT, &dev->state); + + linkwatch_init_dev(dev); + + dev_init_scheduler(dev); + + netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); + list_netdevice(dev); + + add_device_randomness(dev->dev_addr, dev->addr_len); + + /* If the device has permanent device address, driver should + * set dev_addr and also addr_assign_type should be set to + * NET_ADDR_PERM (default value). + */ + if (dev->addr_assign_type == NET_ADDR_PERM) + memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); + + /* Notify protocols, that a new device appeared. */ + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + ret = notifier_to_errno(ret); + if (ret) { + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + unregister_netdevice_queue(dev, NULL); + goto out; + } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + +out: + return ret; + +err_uninit: + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); +err_free_name: + netdev_name_node_free(dev->name_node); + goto out; +} +EXPORT_SYMBOL(register_netdevice); + +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + /* napi_busy_loop stats accounting wants this */ + dev_net_set(dev, &init_net); + + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + +/** + * register_netdev - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * This is a wrapper around register_netdevice that takes the rtnl semaphore + * and expands the device name if you passed a format string to + * alloc_netdev. + */ +int register_netdev(struct net_device *dev) +{ + int err; + + if (rtnl_lock_killable()) + return -EINTR; + err = register_netdevice(dev); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdev); + +int netdev_refcnt_read(const struct net_device *dev) +{ +#ifdef CONFIG_PCPU_DEV_REFCNT + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif +} +EXPORT_SYMBOL(netdev_refcnt_read); + +int netdev_unregister_timeout_secs __read_mostly = 10; + +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 +/** + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on + * + * This is called when unregistering network devices. + * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put. + */ +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) +{ + unsigned long rebroadcast_time, warning_time; + struct net_device *dev; + int wait = 0; + + rebroadcast_time = warning_time = jiffies; + + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { + rtnl_lock(); + + /* Rebroadcast unregister notification */ + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + __rtnl_unlock(); + rcu_barrier(); + rtnl_lock(); + + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + break; + } + + __rtnl_unlock(); + + rebroadcast_time = jiffies; + } + + if (!wait) { + rcu_barrier(); + wait = WAIT_REFS_MIN_MSECS; + } else { + msleep(wait); + wait = min(wait << 1, WAIT_REFS_MAX_MSECS); + } + + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + if (time_after(jiffies, warning_time + + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } + + warning_time = jiffies; + } + } +} + +/* The sequence is: + * + * rtnl_lock(); + * ... + * register_netdevice(x1); + * register_netdevice(x2); + * ... + * unregister_netdevice(y1); + * unregister_netdevice(y2); + * ... + * rtnl_unlock(); + * free_netdev(y1); + * free_netdev(y2); + * + * We are invoked by rtnl_unlock(). + * This allows us to deal with problems: + * 1) We can delete sysfs objects which invoke hotplug + * without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + * safely in order to wait for the netdev refcnt to drop to zero. + * + * We must not return until all unregister events added during + * the interval the lock was held have been completed. + */ +void netdev_run_todo(void) +{ + struct net_device *dev, *tmp; + struct list_head list; +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; + + list_replace_init(&net_unlink_list, &unlink_list); + + while (!list_empty(&unlink_list)) { + struct net_device *dev = list_first_entry(&unlink_list, + struct net_device, + unlink_list); + list_del_init(&dev->unlink_list); + dev->nested_level = dev->lower_level - 1; + } +#endif + + /* Snapshot list, allow later requests */ + list_replace_init(&net_todo_list, &list); + + __rtnl_unlock(); + + /* Wait for rcu callbacks to finish before next phase */ + if (!list_empty(&list)) + rcu_barrier(); + + list_for_each_entry_safe(dev, tmp, &list, todo_list) { + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); + continue; + } + + write_lock(&dev_base_lock); + dev->reg_state = NETREG_UNREGISTERED; + write_unlock(&dev_base_lock); + linkwatch_forget_dev(dev); + } + + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); + + /* paranoia */ + BUG_ON(netdev_refcnt_read(dev) != 1); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); + WARN_ON(rcu_access_pointer(dev->ip_ptr)); + WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + + if (dev->priv_destructor) + dev->priv_destructor(dev); + if (dev->needs_free_netdev) + free_netdev(dev); + + if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) + wake_up(&netdev_unregistering_wq); + + /* Free network device */ + kobject_put(&dev->dev.kobj); + } +} + +/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has + * all the same fields in the same order as net_device_stats, with only + * the type differing, but rtnl_link_stats64 may have additional fields + * at the end for newer counters. + */ +void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, + const struct net_device_stats *netdev_stats) +{ + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; + u64 *dst = (u64 *)stats64; + + BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); + for (i = 0; i < n; i++) + dst[i] = (unsigned long)atomic_long_read(&src[i]); + /* zero out counters that only exist in rtnl_link_stats64 */ + memset((char *)stats64 + n * sizeof(u64), 0, + sizeof(*stats64) - n * sizeof(u64)); +} +EXPORT_SYMBOL(netdev_stats_to_stats64); + +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) +{ + struct net_device_core_stats __percpu *p; + + p = alloc_percpu_gfp(struct net_device_core_stats, + GFP_ATOMIC | __GFP_NOWARN); + + if (p && cmpxchg(&dev->core_stats, NULL, p)) + free_percpu(p); + + /* This READ_ONCE() pairs with the cmpxchg() above */ + return READ_ONCE(dev->core_stats); +} +EXPORT_SYMBOL(netdev_core_stats_alloc); + +/** + * dev_get_stats - get network device statistics + * @dev: device to get statistics from + * @storage: place to store stats + * + * Get network statistics from device. Return @storage. + * The device driver may provide its own method by setting + * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; + * otherwise the internal statistics structure is used. + */ +struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage) +{ + const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_core_stats __percpu *p; + + if (ops->ndo_get_stats64) { + memset(storage, 0, sizeof(*storage)); + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { + netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else { + netdev_stats_to_stats64(storage, &dev->stats); + } + + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + p = READ_ONCE(dev->core_stats); + if (p) { + const struct net_device_core_stats *core_stats; + int i; + + for_each_possible_cpu(i) { + core_stats = per_cpu_ptr(p, i); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); + storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); + } + } + return storage; +} +EXPORT_SYMBOL(dev_get_stats); + +/** + * dev_fetch_sw_netstats - get per-cpu network device statistics + * @s: place to store stats + * @netstats: per-cpu network stats to read from + * + * Read per-cpu network statistics and populate the related fields in @s. + */ +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; + const struct pcpu_sw_netstats *stats; + unsigned int start; + + stats = per_cpu_ptr(netstats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; + } +} +EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); + +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); + RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + +static const struct ethtool_ops default_ethtool_ops; + +void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops) +{ + if (dev->ethtool_ops == &default_ethtool_ops) + dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); + +void netdev_freemem(struct net_device *dev) +{ + char *addr = (char *)dev - dev->padded; + + kvfree(addr); +} + +/** + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. + */ +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) +{ + struct net_device *dev; + unsigned int alloc_size; + struct net_device *p; + + BUG_ON(strlen(name) >= sizeof(dev->name)); + + if (txqs < 1) { + pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); + return NULL; + } + + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); + return NULL; + } + + alloc_size = sizeof(struct net_device); + if (sizeof_priv) { + /* ensure 32-byte alignment of private area */ + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); + alloc_size += sizeof_priv; + } + /* ensure 32-byte alignment of whole construct */ + alloc_size += NETDEV_ALIGN - 1; + + p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); + if (!p) + return NULL; + + dev = PTR_ALIGN(p, NETDEV_ALIGN); + dev->padded = (char *)dev - (char *)p; + + ref_tracker_dir_init(&dev->refcnt_tracker, 128); +#ifdef CONFIG_PCPU_DEV_REFCNT + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) + goto free_dev; + __dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); +#endif + + if (dev_addr_init(dev)) + goto free_pcpu; + + dev_mc_init(dev); + dev_uc_init(dev); + + dev_net_set(dev, &init_net); + + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; + dev->upper_level = 1; + dev->lower_level = 1; +#ifdef CONFIG_LOCKDEP + dev->nested_level = 0; + INIT_LIST_HEAD(&dev->unlink_list); +#endif + + INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->close_list); + INIT_LIST_HEAD(&dev->link_watch_list); + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); + INIT_LIST_HEAD(&dev->net_notifier_list); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + setup(dev); + + if (!dev->tx_queue_len) { + dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + } + + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_all; + + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_all; + + strcpy(dev->name, name); + dev->name_assign_type = name_assign_type; + dev->group = INIT_NETDEV_GROUP; + if (!dev->ethtool_ops) + dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_netdev_init(dev); + + return dev; + +free_all: + free_netdev(dev); + return NULL; + +free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT + free_percpu(dev->pcpu_refcnt); +free_dev: +#endif + netdev_freemem(dev); + return NULL; +} +EXPORT_SYMBOL(alloc_netdev_mqs); + +/** + * free_netdev - free network device + * @dev: device + * + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. If this + * is the last reference then it will be freed.Must be called in process + * context. + */ +void free_netdev(struct net_device *dev) +{ + struct napi_struct *p, *n; + + might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + + netif_free_tx_queues(dev); + netif_free_rx_queues(dev); + + kfree(rcu_dereference_protected(dev->ingress_queue, 1)); + + /* Flush device addresses */ + dev_addr_flush(dev); + + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + netif_napi_del(p); + + ref_tracker_dir_exit(&dev->refcnt_tracker); +#ifdef CONFIG_PCPU_DEV_REFCNT + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; +#endif + free_percpu(dev->core_stats); + dev->core_stats = NULL; + free_percpu(dev->xdp_bulkq); + dev->xdp_bulkq = NULL; + + /* Compatibility with error handling in drivers */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + netdev_freemem(dev); + return; + } + + BUG_ON(dev->reg_state != NETREG_UNREGISTERED); + dev->reg_state = NETREG_RELEASED; + + /* will free via device release */ + put_device(&dev->dev); +} +EXPORT_SYMBOL(free_netdev); + +/** + * synchronize_net - Synchronize with packet receive processing + * + * Wait for packets currently being received to be done. + * Does not block later packets from starting. + */ +void synchronize_net(void) +{ + might_sleep(); + if (rtnl_is_locked()) + synchronize_rcu_expedited(); + else + synchronize_rcu(); +} +EXPORT_SYMBOL(synchronize_net); + +/** + * unregister_netdevice_queue - remove device from the kernel + * @dev: device + * @head: list + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. + * + * Callers must hold the rtnl semaphore. You may want + * unregister_netdev() instead of this. + */ + +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) +{ + ASSERT_RTNL(); + + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + unregister_netdevice_many(&single); + } +} +EXPORT_SYMBOL(unregister_netdevice_queue); + +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + if (list_empty(head)) + return; + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. */ + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head, true); + + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + write_lock(&dev_base_lock); + unlist_netdevice(dev, false); + dev->reg_state = NETREG_UNREGISTERING; + write_unlock(&dev_base_lock); + } + flush_all_backlogs(); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + dev_xdp_uninstall(dev); + + netdev_offload_xstats_disable_all(dev); + + /* Notify protocols, that we are about to destroy + * this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + GFP_KERNEL, NULL, 0); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + + /* Notifier chain MUST detach us all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + netdev_put(dev, &dev->dev_registered_tracker); + net_set_todo(dev); + } + + list_del(head); +} +EXPORT_SYMBOL(unregister_netdevice_many); + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. + */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(unregister_netdev); + +/** + * __dev_change_net_namespace - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * @new_ifindex: If not zero, specifies device index in the target + * namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + */ + +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) +{ + struct netdev_name_node *name_node; + struct net *net_old = dev_net(dev); + char new_name[IFNAMSIZ] = {}; + int err, new_nsid; + + ASSERT_RTNL(); + + /* Don't allow namespace local devices to be moved. */ + err = -EINVAL; + if (dev->features & NETIF_F_NETNS_LOCAL) + goto out; + + /* Ensure the device has been registrered */ + if (dev->reg_state != NETREG_REGISTERED) + goto out; + + /* Get out if there is nothing todo */ + err = 0; + if (net_eq(net_old, net)) + goto out; + + /* Pick the destination device name, and ensure + * we can use it in the destination network namespace. + */ + err = -EEXIST; + if (netdev_name_in_use(net, dev->name)) { + /* We get here if we can't use the current device name */ + if (!pat) + goto out; + err = dev_prep_valid_name(net, dev, pat, new_name); + if (err < 0) + goto out; + } + /* Check that none of the altnames conflicts. */ + err = -EEXIST; + netdev_for_each_altname(dev, name_node) + if (netdev_name_in_use(net, name_node->name)) + goto out; + + /* Check that new_ifindex isn't used yet. */ + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; + + /* + * And now a mini version of register_netdevice unregister_netdevice. + */ + + /* If device is running close it first. */ + dev_close(dev); + + /* And unlink it from device chain */ + unlist_netdevice(dev, true); + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + * this device. They should clean all the things. + * + * Note that dev->reg_state stays at NETREG_REGISTERED. + * This is wanted because this way 8021q and macvlan know + * the device is just moving and can keep their slaves up. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + rcu_barrier(); + + new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); + /* If there is an ifindex conflict assign a new one */ + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + /* Send a netdev-removed uevent to the old namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + netdev_adjacent_del_links(dev); + + /* Move per-net netdevice notifiers that are following the netdevice */ + move_netdevice_notifiers_dev_net(dev, net); + + /* Actually switch the network namespace */ + dev_net_set(dev, net); + dev->ifindex = new_ifindex; + + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); + + if (new_name[0]) /* Rename the netdev to prepared name */ + strscpy(dev->name, new_name, IFNAMSIZ); + + /* Fixup kobjects */ + err = device_rename(&dev->dev, dev->name); + WARN_ON(err); + + /* Adapt owner in case owning user namespace of target network + * namespace is different from the original one. + */ + err = netdev_change_owner(dev, net_old, net); + WARN_ON(err); + + /* Add the device back in the hashes */ + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + call_netdevice_notifiers(NETDEV_REGISTER, dev); + + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + + synchronize_net(); + err = 0; +out: + return err; +} +EXPORT_SYMBOL_GPL(__dev_change_net_namespace); + +static int dev_cpu_dead(unsigned int oldcpu) +{ + struct sk_buff **list_skb; + struct sk_buff *skb; + unsigned int cpu; + struct softnet_data *sd, *oldsd, *remsd = NULL; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Append output queue from offline CPU. */ + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } + /* Append NAPI poll list from offline CPU, with one exception : + * process_backlog() must be called by cpu owning percpu backlog. + * We properly handle process_queue & input_pkt_queue later. + */ + while (!list_empty(&oldsd->poll_list)) { + struct napi_struct *napi = list_first_entry(&oldsd->poll_list, + struct napi_struct, + poll_list); + + list_del_init(&napi->poll_list); + if (napi->poll == process_backlog) + napi->state = 0; + else + ____napi_schedule(sd, napi); + } + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + +#ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; +#endif + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->process_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + + return 0; +} + +/** + * netdev_increment_features - increment feature set by one + * @all: current feature set + * @one: new feature set + * @mask: mask feature set + * + * Computes a new feature set after adding a device with feature set + * @one to the master device with current feature set @all. Will not + * enable anything that is off in @mask. Returns the new feature set. + */ +netdev_features_t netdev_increment_features(netdev_features_t all, + netdev_features_t one, netdev_features_t mask) +{ + if (mask & NETIF_F_HW_CSUM) + mask |= NETIF_F_CSUM_MASK; + mask |= NETIF_F_VLAN_CHALLENGED; + + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; + all &= one | ~NETIF_F_ALL_FOR_ALL; + + /* If one device supports hw checksumming, set for all. */ + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); + + return all; +} +EXPORT_SYMBOL(netdev_increment_features); + +static struct hlist_head * __net_init netdev_create_hash(void) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +/* Initialize per network namespace state */ +static int __net_init netdev_init(struct net *net) +{ + BUILD_BUG_ON(GRO_HASH_BUCKETS > + 8 * sizeof_field(struct napi_struct, gro_bitmask)); + + INIT_LIST_HEAD(&net->dev_base_head); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) + goto err_name; + + net->dev_index_head = netdev_create_hash(); + if (net->dev_index_head == NULL) + goto err_idx; + + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + + return 0; + +err_idx: + kfree(net->dev_name_head); +err_name: + return -ENOMEM; +} + +/** + * netdev_drivername - network driver for the device + * @dev: network device + * + * Determine network driver for device. + */ +const char *netdev_drivername(const struct net_device *dev) +{ + const struct device_driver *driver; + const struct device *parent; + const char *empty = ""; + + parent = dev->dev.parent; + if (!parent) + return empty; + + driver = parent->driver; + if (driver && driver->name) + return driver->name; + return empty; +} + +static void __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) +{ + if (dev && dev->dev.parent) { + dev_printk_emit(level[1] - '0', + dev->dev.parent, + "%s %s %s%s: %pV", + dev_driver_string(dev->dev.parent), + dev_name(dev->dev.parent), + netdev_name(dev), netdev_reg_state(dev), + vaf); + } else if (dev) { + printk("%s%s%s: %pV", + level, netdev_name(dev), netdev_reg_state(dev), vaf); + } else { + printk("%s(NULL net_device): %pV", level, vaf); + } +} + +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + __netdev_printk(level, dev, &vaf); + + va_end(args); +} +EXPORT_SYMBOL(netdev_printk); + +#define define_netdev_printk_level(func, level) \ +void func(const struct net_device *dev, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __netdev_printk(level, dev, &vaf); \ + \ + va_end(args); \ +} \ +EXPORT_SYMBOL(func); + +define_netdev_printk_level(netdev_emerg, KERN_EMERG); +define_netdev_printk_level(netdev_alert, KERN_ALERT); +define_netdev_printk_level(netdev_crit, KERN_CRIT); +define_netdev_printk_level(netdev_err, KERN_ERR); +define_netdev_printk_level(netdev_warn, KERN_WARNING); +define_netdev_printk_level(netdev_notice, KERN_NOTICE); +define_netdev_printk_level(netdev_info, KERN_INFO); + +static void __net_exit netdev_exit(struct net *net) +{ + kfree(net->dev_name_head); + kfree(net->dev_index_head); + if (net != &init_net) + WARN_ON_ONCE(!list_empty(&net->dev_base_head)); +} + +static struct pernet_operations __net_initdata netdev_net_ops = { + .init = netdev_init, + .exit = netdev_exit, +}; + +static void __net_exit default_device_exit_net(struct net *net) +{ + struct netdev_name_node *name_node, *tmp; + struct net_device *dev, *aux; + /* + * Push all migratable network devices back to the + * initial network namespace + */ + ASSERT_RTNL(); + for_each_netdev_safe(net, dev, aux) { + int err; + char fb_name[IFNAMSIZ]; + + /* Ignore unmoveable devices (i.e. loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) + continue; + + /* Push remaining network devices to init_net */ + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + if (netdev_name_in_use(&init_net, fb_name)) + snprintf(fb_name, IFNAMSIZ, "dev%%d"); + + netdev_for_each_altname_safe(dev, name_node, tmp) + if (netdev_name_in_use(&init_net, name_node->name)) { + netdev_name_node_del(name_node); + synchronize_rcu(); + __netdev_name_node_alt_destroy(name_node); + } + + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + pr_emerg("%s: failed to move %s to init_net: %d\n", + __func__, dev->name, err); + BUG(); + } + } +} + +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } + + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); +} + +static struct pernet_operations __net_initdata default_device_ops = { + .exit_batch = default_device_exit_batch, +}; + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ + +/* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. + */ +static int __init net_dev_init(void) +{ + int i, rc = -ENOMEM; + + BUG_ON(!dev_boot_phase); + + if (dev_proc_init()) + goto out; + + if (netdev_kobject_init()) + goto out; + + INIT_LIST_HEAD(&ptype_all); + for (i = 0; i < PTYPE_HASH_SIZE; i++) + INIT_LIST_HEAD(&ptype_base[i]); + + if (register_pernet_subsys(&netdev_net_ops)) + goto out; + + /* + * Initialise the packet receive queues. + */ + + for_each_possible_cpu(i) { + struct work_struct *flush = per_cpu_ptr(&flush_works, i); + struct softnet_data *sd = &per_cpu(softnet_data, i); + + INIT_WORK(flush, flush_backlog); + + skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif + INIT_LIST_HEAD(&sd->poll_list); + sd->output_queue_tailp = &sd->output_queue; +#ifdef CONFIG_RPS + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); + sd->cpu = i; +#endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); + spin_lock_init(&sd->defer_lock); + + init_gro_hash(&sd->backlog); + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; + } + + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. + */ + if (register_pernet_device(&loopback_net_ops)) + goto out; + + if (register_pernet_device(&default_device_ops)) + goto out; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); + + rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", + NULL, dev_cpu_dead); + WARN_ON(rc < 0); + rc = 0; +out: + return rc; +} + +subsys_initcall(net_dev_init); diff --git a/net/core/dev.h b/net/core/dev.h new file mode 100644 index 000000000..db9ff8cd8 --- /dev/null +++ b/net/core/dev.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_DEV_H +#define _NET_CORE_DEV_H + +#include <linux/types.h> + +struct net; +struct net_device; +struct netdev_bpf; +struct netdev_phys_item_id; +struct netlink_ext_ack; + +/* Random bits of netdevice that don't need to be exposed */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ +struct sd_flow_limit { + u64 count; + unsigned int num_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; + +#ifdef CONFIG_PROC_FS +int __init dev_proc_init(void); +#else +#define dev_proc_init() 0 +#endif + +void linkwatch_init_dev(struct net_device *dev); +void linkwatch_forget_dev(struct net_device *dev); +void linkwatch_run_queue(void); + +void dev_addr_flush(struct net_device *dev); +int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); + +/* sysctls not referred to from outside net/core/ */ +extern int netdev_budget; +extern unsigned int netdev_budget_usecs; +extern unsigned int sysctl_skb_defer_max; +extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; +extern int weight_p; +extern int dev_weight_rx_bias; +extern int dev_weight_tx_bias; + +/* rtnl helpers */ +extern struct list_head net_todo_list; +void netdev_run_todo(void); + +/* netdev management, shared between various uAPI entry points */ +struct netdev_name_node { + struct hlist_node hlist; + struct list_head list; + struct net_device *dev; + const char *name; +}; + +int netdev_get_name(struct net *net, char *name, int ifindex); +int dev_change_name(struct net_device *dev, const char *newname); + +#define netdev_for_each_altname(dev, namenode) \ + list_for_each_entry((namenode), &(dev)->name_node->list, list) +#define netdev_for_each_altname_safe(dev, namenode, next) \ + list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ + list) + +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); + +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); + +int dev_change_proto_down(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); + +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags); + +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void dev_set_group(struct net_device *dev, int new_group); +int dev_change_carrier(struct net_device *dev, bool new_carrier); + +void __dev_set_rx_mode(struct net_device *dev); + +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + +#endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c new file mode 100644 index 000000000..baa63dee2 --- /dev/null +++ b/net/core/dev_addr_lists.c @@ -0,0 +1,1050 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/dev_addr_lists.c - Functions for handling net device lists + * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> + * + * This file contains functions for working with unicast, multicast and device + * addresses lists. + */ + +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/export.h> +#include <linux/list.h> + +#include "dev.h" + +/* + * General list handling functions + */ + +static int __hw_addr_insert(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *new, int addr_len) +{ + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; + struct netdev_hw_addr *ha; + + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(new->addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&new->type, &ha->type, sizeof(new->type)); + + parent = *ins_point; + if (diff < 0) + ins_point = &parent->rb_left; + else if (diff > 0) + ins_point = &parent->rb_right; + else + return -EEXIST; + } + + rb_link_node_rcu(&new->node, parent, ins_point); + rb_insert_color(&new->node, &list->tree); + + return 0; +} + +static struct netdev_hw_addr* +__hw_addr_create(const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return NULL; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + ha->refcount = 1; + ha->global_use = global; + ha->synced = sync ? 1 : 0; + ha->sync_cnt = 0; + + return ha; +} + +static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync, + int sync_count, bool exclusive) +{ + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; + struct netdev_hw_addr *ha; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + parent = *ins_point; + if (diff < 0) { + ins_point = &parent->rb_left; + } else if (diff > 0) { + ins_point = &parent->rb_right; + } else { + if (exclusive) + return -EEXIST; + if (global) { + /* check if addr is already used as global */ + if (ha->global_use) + return 0; + else + ha->global_use = true; + } + if (sync) { + if (ha->synced && sync_count) + return -EEXIST; + else + ha->synced++; + } + ha->refcount++; + return 0; + } + } + + ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); + if (!ha) + return -ENOMEM; + + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); + + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + + return 0; +} + +static int __hw_addr_add(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, + 0, false); +} + +static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *ha, bool global, + bool sync) +{ + if (global && !ha->global_use) + return -ENOENT; + + if (sync && !ha->synced) + return -ENOENT; + + if (global) + ha->global_use = false; + + if (sync) + ha->synced--; + + if (--ha->refcount) + return 0; + + rb_erase(&ha->node, &list->tree); + + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + return 0; +} + +static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + struct rb_node *node; + + node = list->tree.rb_node; + + while (node) { + struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); + int diff = memcmp(addr, ha->addr, addr_len); + + if (diff == 0 && addr_type) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + if (diff < 0) + node = node->rb_left; + else if (diff > 0) + node = node->rb_right; + else + return ha; + } + + return NULL; +} + +static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync) +{ + struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); + + if (!ha) + return -ENOENT; + return __hw_addr_del_entry(list, ha, global, sync); +} + +static int __hw_addr_del(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); +} + +static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr *ha, + int addr_len) +{ + int err; + + err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, + false, true, ha->sync_cnt, false); + if (err && err != -EEXIST) + return err; + + if (!err) { + ha->sync_cnt++; + ha->refcount++; + } + + return 0; +} + +static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + struct netdev_hw_addr *ha, + int addr_len) +{ + int err; + + err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, + false, true); + if (err) + return; + ha->sync_cnt--; + /* address on from list is not marked synced */ + __hw_addr_del_entry(from_list, ha, false, false); +} + +static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->sync_cnt == ha->refcount) { + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); + } else { + err = __hw_addr_sync_one(to_list, ha, addr_len); + if (err) + break; + } + } + return err; +} + +/* This function only works where there is a strict 1-1 relationship + * between source and destionation of they synch. If you ever need to + * sync addresses to more then 1 destination, you need to use + * __hw_addr_sync_multiple(). + */ +int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (!ha->sync_cnt) { + err = __hw_addr_sync_one(to_list, ha, addr_len); + if (err) + break; + } else if (ha->refcount == 1) + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); + } + return err; +} +EXPORT_SYMBOL(__hw_addr_sync); + +void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->sync_cnt) + __hw_addr_unsync_one(to_list, from_list, ha, addr_len); + } +} +EXPORT_SYMBOL(__hw_addr_unsync); + +/** + * __hw_addr_sync_dev - Synchonize device's multicast list + * @list: address list to syncronize + * @dev: device to sync + * @sync: function to call if address should be added + * @unsync: function to call if address should be removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address add/remove + * notifications. The unsync function may be NULL in which case + * the addresses requiring removal will simply be removed without + * any notification to the device. + **/ +int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, const unsigned char *), + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + int err; + + /* first go through and flush out any stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt || ha->refcount != 1) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (ha->sync_cnt) + continue; + + err = sync(dev, ha->addr); + if (err) + return err; + + ha->sync_cnt++; + ha->refcount++; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_sync_dev); + +/** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from + * @dev: device to sync + * @unsync: function to call if address should be removed + * + * Remove all addresses that were added to the device by __hw_addr_sync_dev(). + * This function is intended to be called from the ndo_stop or ndo_open + * functions on devices that require explicit address add/remove + * notifications. If the unsync function pointer is NULL then this function + * can be used to just reset the sync_cnt for the addresses in the list. + **/ +void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if unsync is defined and fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr)) + continue; + + ha->sync_cnt--; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_unsync_dev); + +static void __hw_addr_flush(struct netdev_hw_addr_list *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list->tree = RB_ROOT; + list_for_each_entry_safe(ha, tmp, &list->list, list) { + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + } + list->count = 0; +} + +void __hw_addr_init(struct netdev_hw_addr_list *list) +{ + INIT_LIST_HEAD(&list->list); + list->count = 0; + list->tree = RB_ROOT; +} +EXPORT_SYMBOL(__hw_addr_init); + +/* + * Device addresses handling functions + */ + +/* Check that netdev->dev_addr is not written to directly as this would + * break the rbtree layout. All changes should go thru dev_addr_set() and co. + * Remove this check in mid-2024. + */ +void dev_addr_check(struct net_device *dev) +{ + if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) + return; + + netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); + netdev_warn(dev, "Expected addr: %*ph\n", + MAX_ADDR_LEN, dev->dev_addr_shadow); + netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); +} + +/** + * dev_addr_flush - Flush device address list + * @dev: device + * + * Flush device address list and reset ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + dev_addr_check(dev); + + __hw_addr_flush(&dev->dev_addrs); + dev->dev_addr = NULL; +} + +/** + * dev_addr_init - Init device address list + * @dev: device + * + * Init device address list and create the first element, + * used by ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + __hw_addr_init(&dev->dev_addrs); + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} + +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len) +{ + struct netdev_hw_addr *ha; + + dev_addr_check(dev); + + ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); + rb_erase(&ha->node, &dev->dev_addrs.tree); + memcpy(&ha->addr[offset], addr, len); + memcpy(&dev->dev_addr_shadow[offset], addr, len); + WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); +} +EXPORT_SYMBOL(dev_addr_mod); + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, const unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = dev_pre_changeaddr_notify(dev, addr, NULL); + if (err) + return err; + err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, const unsigned char *addr, + unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha; + + ASSERT_RTNL(); + + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + if (!memcmp(ha->addr, addr, dev->addr_len) && + ha->type == addr_type && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/* + * Unicast list handling functions + */ + +/** + * dev_uc_add_excl - Add a global secondary unicast address + * @dev: device + * @addr: address to add + */ +int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST, true, false, + 0, true); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_add_excl); + +/** + * dev_uc_add - Add a secondary unicast address + * @dev: device + * @addr: address to add + * + * Add a secondary unicast address to the device or increase + * the reference count if it already exists. + */ +int dev_uc_add(struct net_device *dev, const unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_add); + +/** + * dev_uc_del - Release secondary unicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a secondary unicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_uc_del(struct net_device *dev, const unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_del); + +/** + * dev_uc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. This function assumes that + * addresses will only ever be synced to the @to devices and no other. + */ +int dev_uc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock(to); + err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync); + +/** + * dev_uc_sync_multiple - Synchronize device's unicast list to another + * device, but allow for multiple calls to sync to multiple devices. + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have been deleted from the source. The source device + * must be locked by netif_addr_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. It allows for a single source + * device to be synced to multiple destination devices. + */ +int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock(to); + err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync_multiple); + +/** + * dev_uc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_uc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_uc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two + * reasons: + * 1) This is always called without any addr_list_lock, so as the + * outermost one here, it must be 0. + * 2) This is called by some callers after unlinking the upper device, + * so the dev->lower_level becomes 1 again. + * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or + * larger. + */ + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_uc_unsync); + +/** + * dev_uc_flush - Flush unicast addresses + * @dev: device + * + * Flush unicast addresses. + */ +void dev_uc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->uc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_uc_flush); + +/** + * dev_uc_init - Init unicast address list + * @dev: device + * + * Init unicast address list. + */ +void dev_uc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->uc); +} +EXPORT_SYMBOL(dev_uc_init); + +/* + * Multicast list handling functions + */ + +/** + * dev_mc_add_excl - Add a global secondary multicast address + * @dev: device + * @addr: address to add + */ +int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, true, false, + 0, true); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_mc_add_excl); + +static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global, false, + 0, false); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +/** + * dev_mc_add - Add a multicast address + * @dev: device + * @addr: address to add + * + * Add a multicast address to the device or increase + * the reference count if it already exists. + */ +int dev_mc_add(struct net_device *dev, const unsigned char *addr) +{ + return __dev_mc_add(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_add); + +/** + * dev_mc_add_global - Add a global multicast address + * @dev: device + * @addr: address to add + * + * Add a global multicast address to the device. + */ +int dev_mc_add_global(struct net_device *dev, const unsigned char *addr) +{ + return __dev_mc_add(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_add_global); + +static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global, false); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} + +/** + * dev_mc_del - Delete a multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del(struct net_device *dev, const unsigned char *addr) +{ + return __dev_mc_del(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_del); + +/** + * dev_mc_del_global - Delete a global multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) +{ + return __dev_mc_del(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_del_global); + +/** + * dev_mc_sync - Synchronize device's multicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the ndo_set_rx_mode + * function of layered software devices. + */ +int dev_mc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock(to); + err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync); + +/** + * dev_mc_sync_multiple - Synchronize device's multicast list to another + * device, but allow for multiple calls to sync to multiple devices. + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the ndo_set_rx_mode + * function of layered software devices. It allows for a single + * source device to be synced to multiple destination devices. + */ +int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock(to); + err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync_multiple); + +/** + * dev_mc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_mc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_mc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + /* See the above comments inside dev_uc_unsync(). */ + netif_addr_lock_bh(from); + netif_addr_lock(to); + __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_mc_unsync); + +/** + * dev_mc_flush - Flush multicast addresses + * @dev: device + * + * Flush multicast addresses. + */ +void dev_mc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->mc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_mc_flush); + +/** + * dev_mc_init - Init multicast address list + * @dev: device + * + * Init multicast address list. + */ +void dev_mc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->mc); +} +EXPORT_SYMBOL(dev_mc_init); diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c new file mode 100644 index 000000000..049cfbc58 --- /dev/null +++ b/net/core/dev_addr_lists_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static const struct net_device_ops dummy_netdev_ops = { +}; + +struct dev_addr_test_priv { + u32 addr_seen; +}; + +static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen |= 1 << a[0]; + return 0; +} + +static int dev_addr_test_unsync(struct net_device *netdev, + const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen &= ~(1 << a[0]); + return 0; +} + +static int dev_addr_test_init(struct kunit *test) +{ + struct dev_addr_test_priv *datp; + struct net_device *netdev; + int err; + + netdev = alloc_etherdev(sizeof(*datp)); + KUNIT_ASSERT_TRUE(test, !!netdev); + + test->priv = netdev; + netdev->netdev_ops = &dummy_netdev_ops; + + err = register_netdev(netdev); + if (err) { + free_netdev(netdev); + KUNIT_FAIL(test, "Can't register netdev %d", err); + } + + rtnl_lock(); + return 0; +} + +static void dev_addr_test_exit(struct kunit *test) +{ + struct net_device *netdev = test->priv; + + rtnl_unlock(); + unregister_netdev(netdev); + free_netdev(netdev); +} + +static void dev_addr_test_basic(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); + + memset(addr, 3, sizeof(addr)); + dev_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); +} + +static void dev_addr_test_sync_one(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + + datp = netdev_priv(netdev); + + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 2, datp->addr_seen); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + datp->addr_seen = 0; + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + /* It's not going to sync anything because the main address is + * considered synced and we overwrite in place. + */ + KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); +} + +static void dev_addr_test_add_del(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + /* Add 3 again */ + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); +} + +static void dev_addr_test_del_main(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); +} + +static void dev_addr_test_add_set(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + /* There is no external API like dev_addr_add_excl(), + * so shuffle the tree a little bit and exploit aliasing. + */ + for (i = 1; i < 16; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + memset(addr, i, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + memset(addr, 0, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); +} + +static void dev_addr_test_add_excl(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + int i; + + for (i = 0; i < 10; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); + } + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + + for (i = 0; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr)); + } + for (i = 1; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + } +} + +static struct kunit_case dev_addr_test_cases[] = { + KUNIT_CASE(dev_addr_test_basic), + KUNIT_CASE(dev_addr_test_sync_one), + KUNIT_CASE(dev_addr_test_add_del), + KUNIT_CASE(dev_addr_test_del_main), + KUNIT_CASE(dev_addr_test_add_set), + KUNIT_CASE(dev_addr_test_add_excl), + {} +}; + +static struct kunit_suite dev_addr_test_suite = { + .name = "dev-addr-list-test", + .test_cases = dev_addr_test_cases, + .init = dev_addr_test_init, + .exit = dev_addr_test_exit, +}; +kunit_test_suite(dev_addr_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c new file mode 100644 index 000000000..7674bb9f3 --- /dev/null +++ b/net/core/dev_ioctl.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kmod.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/rtnetlink.h> +#include <linux/net_tstamp.h> +#include <linux/wireless.h> +#include <linux/if_bridge.h> +#include <net/dsa.h> +#include <net/wext.h> + +#include "dev.h" + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct net *net, struct ifreq *ifr) +{ + ifr->ifr_name[IFNAMSIZ-1] = 0; + return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ +int dev_ifconf(struct net *net, struct ifconf __user *uifc) +{ + struct net_device *dev; + void __user *pos; + size_t size; + int len, total = 0, done; + + /* both the ifconf and the ifreq structures are slightly different */ + if (in_compat_syscall()) { + struct compat_ifconf ifc32; + + if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) + return -EFAULT; + + pos = compat_ptr(ifc32.ifcbuf); + len = ifc32.ifc_len; + size = sizeof(struct compat_ifreq); + } else { + struct ifconf ifc; + + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + size = sizeof(struct ifreq); + } + + /* Loop over the interfaces, and write an info block for each. */ + rtnl_lock(); + for_each_netdev(net, dev) { + if (!pos) + done = inet_gifconf(dev, NULL, 0, size); + else + done = inet_gifconf(dev, pos + total, + len - total, size); + if (done < 0) { + rtnl_unlock(); + return -EFAULT; + } + total += done; + } + rtnl_unlock(); + + return put_user(total, &uifc->ifc_len); +} + +static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct ifmap *ifmap = &ifr->ifr_map; + + if (in_compat_syscall()) { + struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; + + cifmap->mem_start = dev->mem_start; + cifmap->mem_end = dev->mem_end; + cifmap->base_addr = dev->base_addr; + cifmap->irq = dev->irq; + cifmap->dma = dev->dma; + cifmap->port = dev->if_port; + + return 0; + } + + ifmap->mem_start = dev->mem_start; + ifmap->mem_end = dev->mem_end; + ifmap->base_addr = dev->base_addr; + ifmap->irq = dev->irq; + ifmap->dma = dev->dma; + ifmap->port = dev->if_port; + + return 0; +} + +static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; + + if (!dev->netdev_ops->ndo_set_config) + return -EOPNOTSUPP; + + if (in_compat_syscall()) { + struct ifmap ifmap = { + .mem_start = cifmap->mem_start, + .mem_end = cifmap->mem_end, + .base_addr = cifmap->base_addr, + .irq = cifmap->irq, + .dma = cifmap->dma, + .port = cifmap->port, + }; + + return dev->netdev_ops->ndo_set_config(dev, &ifmap); + } + + return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); +} + +/* + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (short) dev_get_flags(dev); + return 0; + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCGIFSLAVE: + err = -EINVAL; + break; + + case SIOCGIFMAP: + return dev_getifmap(dev, ifr); + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -ENOTTY; + break; + + } + return err; +} + +static int net_hwtstamp_validate(struct ifreq *ifr) +{ + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags & ~HWTSTAMP_FLAG_MASK) + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + case HWTSTAMP_TX_ONESTEP_P2P: + tx_type_valid = 1; + break; + case __HWTSTAMP_TX_CNT: + /* not a real value */ + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: + rx_filter_valid = 1; + break; + case __HWTSTAMP_FILTER_CNT: + /* not a real value */ + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; +} + +static int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + err = dsa_ndo_eth_ioctl(dev, ifr, cmd); + if (err == 0 || err != -EOPNOTSUPP) + return err; + + if (ops->ndo_eth_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_eth_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + + return err; +} + +static int dev_siocbond(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocbond) { + if (netif_device_present(dev)) + return ops->ndo_siocbond(dev, ifr, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocdevprivate) { + if (netif_device_present(dev)) + return ops->ndo_siocdevprivate(dev, ifr, data, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocwandev) { + if (netif_device_present(dev)) + return ops->ndo_siocwandev(dev, ifs); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, + unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops; + netdevice_tracker dev_tracker; + + if (!dev) + return -ENODEV; + + ops = dev->netdev_ops; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags, NULL); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + if (dev->addr_len > sizeof(struct sockaddr)) + return -EINVAL; + return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCSIFMAP: + return dev_setifmap(dev, ifr); + + case SIOCADDMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCDELMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + return dev_change_tx_queue_len(dev, ifr->ifr_qlen); + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + case SIOCWANDEV: + return dev_siocwandev(dev, &ifr->ifr_settings); + + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!netif_device_present(dev)) + return -ENODEV; + if (!netif_is_bridge_master(dev)) + return -EOPNOTSUPP; + netdev_hold(dev, &dev_tracker, GFP_KERNEL); + rtnl_unlock(); + err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); + netdev_put(dev, &dev_tracker); + rtnl_lock(); + return err; + + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + fallthrough; + + /* + * Unknown or private ioctl + */ + default: + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) + return dev_siocdevprivate(dev, ifr, data, cmd); + + if (cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCSHWTSTAMP || + cmd == SIOCGHWTSTAMP) { + err = dev_eth_ioctl(dev, ifr, cmd); + } else if (cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE) { + err = dev_siocbond(dev, ifr, cmd); + } else + err = -EINVAL; + + } + return err; +} + +/** + * dev_load - load a network module + * @net: the applicable net namespace + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ + struct net_device *dev; + int no_module; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); + + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) + request_module("%s", name); +} +EXPORT_SYMBOL(dev_load); + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @net: the applicable net namespace + * @cmd: command to issue + * @ifr: pointer to a struct ifreq in user space + * @need_copyout: whether or not copy_to_user() should be called + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, + void __user *data, bool *need_copyout) +{ + int ret; + char *colon; + + if (need_copyout) + *need_copyout = true; + if (cmd == SIOCGIFNAME) + return dev_ifname(net, ifr); + + ifr->ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr->ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + case SIOCGIFHWADDR: + dev_load(net, ifr->ifr_name); + ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + if (colon) + *colon = ':'; + return ret; + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr->ifr_name); + rcu_read_lock(); + ret = dev_ifsioc_locked(net, ifr, cmd); + rcu_read_unlock(); + if (colon) + *colon = ':'; + return ret; + + case SIOCETHTOOL: + dev_load(net, ifr->ifr_name); + ret = dev_ethtool(net, ifr, data); + if (colon) + *colon = ':'; + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + dev_load(net, ifr->ifr_name); + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + rtnl_lock(); + ret = dev_ifsioc(net, ifr, data, cmd); + rtnl_unlock(); + if (colon) + *colon = ':'; + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFMAP: + case SIOCSIFTXQLEN: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + fallthrough; + /* + * These ioctl calls: + * - require local superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSHWTSTAMP: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + fallthrough; + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr->ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, ifr, data, cmd); + rtnl_unlock(); + if (need_copyout) + *need_copyout = false; + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -ENOTTY; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + cmd == SIOCGHWTSTAMP || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr->ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, ifr, data, cmd); + rtnl_unlock(); + return ret; + } + return -ENOTTY; + } +} diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c new file mode 100644 index 000000000..8e0a90b45 --- /dev/null +++ b/net/core/drop_monitor.c @@ -0,0 +1,1772 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Monitoring code for network dropped packet alerts + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <linux/percpu.h> +#include <linux/timer.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <net/genetlink.h> +#include <net/netevent.h> +#include <net/flow_offload.h> +#include <net/devlink.h> + +#include <trace/events/skb.h> +#include <trace/events/napi.h> +#include <trace/events/devlink.h> + +#include <asm/unaligned.h> + +#define TRACE_ON 1 +#define TRACE_OFF 0 + +/* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ +static int trace_state = TRACE_OFF; +static bool monitor_hw; + +/* net_dm_mutex + * + * An overall lock guarding every operation coming from userspace. + */ +static DEFINE_MUTEX(net_dm_mutex); + +struct net_dm_stats { + u64_stats_t dropped; + struct u64_stats_sync syncp; +}; + +#define NET_DM_MAX_HW_TRAP_NAME_LEN 40 + +struct net_dm_hw_entry { + char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; + u32 count; +}; + +struct net_dm_hw_entries { + u32 num_entries; + struct net_dm_hw_entry entries[]; +}; + +struct per_cpu_dm_data { + spinlock_t lock; /* Protects 'skb', 'hw_entries' and + * 'send_timer' + */ + union { + struct sk_buff *skb; + struct net_dm_hw_entries *hw_entries; + }; + struct sk_buff_head drop_queue; + struct work_struct dm_alert_work; + struct timer_list send_timer; + struct net_dm_stats stats; +}; + +struct dm_hw_stat_delta { + unsigned long last_rx; + unsigned long last_drop_val; + struct rcu_head rcu; +}; + +static struct genl_family net_drop_monitor_family; + +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); + +static int dm_hit_limit = 64; +static int dm_delay = 1; +static unsigned long dm_hw_check_delta = 2*HZ; + +static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; +static u32 net_dm_trunc_len; +static u32 net_dm_queue_len = 1000; + +struct net_dm_alert_ops { + void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, + void *location, + enum skb_drop_reason reason); + void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, + int work, int budget); + void (*work_item_func)(struct work_struct *work); + void (*hw_work_item_func)(struct work_struct *work); + void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata); +}; + +struct net_dm_skb_cb { + union { + struct devlink_trap_metadata *hw_metadata; + void *pc; + }; + enum skb_drop_reason reason; +}; + +#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) + +static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) +{ + size_t al; + struct net_dm_alert_msg *msg; + struct nlattr *nla; + struct sk_buff *skb; + unsigned long flags; + void *msg_header; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + al += sizeof(struct nlattr); + + skb = genlmsg_new(al, GFP_KERNEL); + + if (!skb) + goto err; + + msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + if (!msg_header) { + nlmsg_free(skb); + skb = NULL; + goto err; + } + nla = nla_reserve(skb, NLA_UNSPEC, + sizeof(struct net_dm_alert_msg)); + if (!nla) { + nlmsg_free(skb); + skb = NULL; + goto err; + } + msg = nla_data(nla); + memset(msg, 0, al); + goto out; + +err: + mod_timer(&data->send_timer, jiffies + HZ / 10); +out: + spin_lock_irqsave(&data->lock, flags); + swap(data->skb, skb); + spin_unlock_irqrestore(&data->lock, flags); + + if (skb) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); + + genlmsg_end(skb, genlmsg_data(gnlh)); + } + + return skb; +} + +static const struct genl_multicast_group dropmon_mcgrps[] = { + { .name = "events", .cap_sys_admin = 1 }, +}; + +static void send_dm_alert(struct work_struct *work) +{ + struct sk_buff *skb; + struct per_cpu_dm_data *data; + + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + skb = reset_per_cpu_data(data); + + if (skb) + genlmsg_multicast(&net_drop_monitor_family, skb, 0, + 0, GFP_KERNEL); +} + +/* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the + * hysteresis period. + */ +static void sched_send_work(struct timer_list *t) +{ + struct per_cpu_dm_data *data = from_timer(data, t, send_timer); + + schedule_work(&data->dm_alert_work); +} + +static void trace_drop_common(struct sk_buff *skb, void *location) +{ + struct net_dm_alert_msg *msg; + struct net_dm_drop_point *point; + struct nlmsghdr *nlh; + struct nlattr *nla; + int i; + struct sk_buff *dskb; + struct per_cpu_dm_data *data; + unsigned long flags; + + local_irq_save(flags); + data = this_cpu_ptr(&dm_cpu_data); + spin_lock(&data->lock); + dskb = data->skb; + + if (!dskb) + goto out; + + nlh = (struct nlmsghdr *)dskb->data; + nla = genlmsg_data(nlmsg_data(nlh)); + msg = nla_data(nla); + point = msg->points; + for (i = 0; i < msg->entries; i++) { + if (!memcmp(&location, &point->pc, sizeof(void *))) { + point->count++; + goto out; + } + point++; + } + if (msg->entries == dm_hit_limit) + goto out; + /* + * We need to create a new entry + */ + __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); + nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); + memcpy(point->pc, &location, sizeof(void *)); + point->count = 1; + msg->entries++; + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer(&data->send_timer); + } + +out: + spin_unlock_irqrestore(&data->lock, flags); +} + +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, + void *location, + enum skb_drop_reason reason) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, + int work, int budget) +{ + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; + /* + * Don't check napi structures with no associated device + */ + if (!dev) + return; + + rcu_read_lock(); + stat = rcu_dereference(dev->dm_private); + if (stat) { + /* + * only add a note to our monitor buffer if: + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up + */ + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; + } + } + rcu_read_unlock(); +} + +static struct net_dm_hw_entries * +net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) +{ + struct net_dm_hw_entries *hw_entries; + unsigned long flags; + + hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), + GFP_KERNEL); + if (!hw_entries) { + /* If the memory allocation failed, we try to perform another + * allocation in 1/10 second. Otherwise, the probe function + * will constantly bail out. + */ + mod_timer(&hw_data->send_timer, jiffies + HZ / 10); + } + + spin_lock_irqsave(&hw_data->lock, flags); + swap(hw_data->hw_entries, hw_entries); + spin_unlock_irqrestore(&hw_data->lock, flags); + + return hw_entries; +} + +static int net_dm_hw_entry_put(struct sk_buff *msg, + const struct net_dm_hw_entry *hw_entry) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); + if (!attr) + return -EMSGSIZE; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_hw_entries_put(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct nlattr *attr; + int i; + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); + if (!attr) + return -EMSGSIZE; + + for (i = 0; i < hw_entries->num_entries; i++) { + int rc; + + rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); + if (rc) + goto nla_put_failure; + } + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +net_dm_hw_summary_report_fill(struct sk_buff *msg, + const struct net_dm_hw_entries *hw_entries) +{ + struct net_dm_alert_msg anc_hdr = { 0 }; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_ALERT); + if (!hdr) + return -EMSGSIZE; + + /* We need to put the ancillary header in order not to break user + * space. + */ + if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) + goto nla_put_failure; + + rc = net_dm_hw_entries_put(msg, hw_entries); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void net_dm_hw_summary_work(struct work_struct *work) +{ + struct net_dm_hw_entries *hw_entries; + struct per_cpu_dm_data *hw_data; + struct sk_buff *msg; + int rc; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + if (!hw_entries) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_summary_report_fill(msg, hw_entries); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + kfree(hw_entries); +} + +static void +net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata) +{ + struct net_dm_hw_entries *hw_entries; + struct net_dm_hw_entry *hw_entry; + struct per_cpu_dm_data *hw_data; + unsigned long flags; + int i; + + if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) + return; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + spin_lock_irqsave(&hw_data->lock, flags); + hw_entries = hw_data->hw_entries; + + if (!hw_entries) + goto out; + + for (i = 0; i < hw_entries->num_entries; i++) { + hw_entry = &hw_entries->entries[i]; + if (!strncmp(hw_entry->trap_name, metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { + hw_entry->count++; + goto out; + } + } + if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) + goto out; + + hw_entry = &hw_entries->entries[hw_entries->num_entries]; + strscpy(hw_entry->trap_name, metadata->trap_name, + NET_DM_MAX_HW_TRAP_NAME_LEN - 1); + hw_entry->count = 1; + hw_entries->num_entries++; + + if (!timer_pending(&hw_data->send_timer)) { + hw_data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer(&hw_data->send_timer); + } + +out: + spin_unlock_irqrestore(&hw_data->lock, flags); +} + +static const struct net_dm_alert_ops net_dm_alert_summary_ops = { + .kfree_skb_probe = trace_kfree_skb_hit, + .napi_poll_probe = trace_napi_poll_hit, + .work_item_func = send_dm_alert, + .hw_work_item_func = net_dm_hw_summary_work, + .hw_trap_probe = net_dm_hw_trap_summary_probe, +}; + +static void net_dm_packet_trace_kfree_skb_hit(void *ignore, + struct sk_buff *skb, + void *location, + enum skb_drop_reason reason) +{ + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *data; + struct net_dm_skb_cb *cb; + struct sk_buff *nskb; + unsigned long flags; + + if (!skb_mac_header_was_set(skb)) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0)) + reason = SKB_DROP_REASON_NOT_SPECIFIED; + cb = NET_DM_SKB_CB(nskb); + cb->reason = reason; + cb->pc = location; + /* Override the timestamp because we care about the time when the + * packet was dropped. + */ + nskb->tstamp = tstamp; + + data = this_cpu_ptr(&dm_cpu_data); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + schedule_work(&data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + u64_stats_update_begin(&data->stats.syncp); + u64_stats_inc(&data->stats.dropped); + u64_stats_update_end(&data->stats.syncp); + consume_skb(nskb); +} + +static void net_dm_packet_trace_napi_poll_hit(void *ignore, + struct napi_struct *napi, + int work, int budget) +{ +} + +static size_t net_dm_in_port_size(void) +{ + /* NET_DM_ATTR_IN_PORT nest */ + return nla_total_size(0) + + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PORT_NETDEV_NAME */ + nla_total_size(IFNAMSIZ + 1); +} + +#define NET_DM_MAX_SYMBOL_LEN 40 + +static size_t net_dm_packet_report_size(size_t payload_len, + enum skb_drop_reason reason) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PC */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_SYMBOL */ + nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_REASON */ + nla_total_size(strlen(drop_reasons[reason]) + 1) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, + const char *name) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); + if (!attr) + return -EMSGSIZE; + + if (ifindex && + nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) + goto nla_put_failure; + + if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, + size_t payload_len) +{ + struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); + char buf[NET_DM_MAX_SYMBOL_LEN]; + struct nlattr *attr; + void *hdr; + int rc; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, + NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_REASON, + drop_reasons[cb->reason])) + goto nla_put_failure; + + snprintf(buf, sizeof(buf), "%pS", cb->pc); + if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) + goto nla_put_failure; + + rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); + if (rc) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) + +static void net_dm_packet_report(struct sk_buff *skb) +{ + struct sk_buff *msg; + size_t payload_len; + int rc; + + /* Make sure we start copying the packet from the MAC header */ + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + /* Ensure packet fits inside a single netlink attribute */ + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + msg = nlmsg_new(net_dm_packet_report_size(payload_len, + NET_DM_SKB_CB(skb)->reason), + GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + consume_skb(skb); +} + +static void net_dm_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&data->drop_queue, &list); + spin_unlock_irqrestore(&data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_packet_report(skb); +} + +static size_t +net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) +{ + return hw_metadata->fa_cookie ? + nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; +} + +static size_t +net_dm_hw_packet_report_size(size_t payload_len, + const struct devlink_trap_metadata *hw_metadata) +{ + size_t size; + + size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); + + return NLMSG_ALIGN(size) + + /* NET_DM_ATTR_ORIGIN */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ + nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + + /* NET_DM_ATTR_HW_TRAP_NAME */ + nla_total_size(strlen(hw_metadata->trap_name) + 1) + + /* NET_DM_ATTR_IN_PORT */ + net_dm_in_port_size() + + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ + net_dm_flow_action_cookie_size(hw_metadata) + + /* NET_DM_ATTR_TIMESTAMP */ + nla_total_size(sizeof(u64)) + + /* NET_DM_ATTR_ORIG_LEN */ + nla_total_size(sizeof(u32)) + + /* NET_DM_ATTR_PROTO */ + nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_PAYLOAD */ + nla_total_size(payload_len); +} + +static int net_dm_hw_packet_report_fill(struct sk_buff *msg, + struct sk_buff *skb, size_t payload_len) +{ + struct devlink_trap_metadata *hw_metadata; + struct nlattr *attr; + void *hdr; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + + hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, + NET_DM_CMD_PACKET_ALERT); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, + hw_metadata->trap_group_name)) + goto nla_put_failure; + + if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, + hw_metadata->trap_name)) + goto nla_put_failure; + + if (hw_metadata->input_dev) { + struct net_device *dev = hw_metadata->input_dev; + int rc; + + rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, + dev->name); + if (rc) + goto nla_put_failure; + } + + if (hw_metadata->fa_cookie && + nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, + hw_metadata->fa_cookie->cookie_len, + hw_metadata->fa_cookie->cookie)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, + ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) + goto nla_put_failure; + + if (!payload_len) + goto out; + + if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) + goto nla_put_failure; + + attr = skb_put(msg, nla_total_size(payload_len)); + attr->nla_type = NET_DM_ATTR_PAYLOAD; + attr->nla_len = nla_attr_size(payload_len); + if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static struct devlink_trap_metadata * +net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) +{ + const struct flow_action_cookie *fa_cookie; + struct devlink_trap_metadata *hw_metadata; + const char *trap_group_name; + const char *trap_name; + + hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); + if (!hw_metadata) + return NULL; + + trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); + if (!trap_group_name) + goto free_hw_metadata; + hw_metadata->trap_group_name = trap_group_name; + + trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); + if (!trap_name) + goto free_trap_group; + hw_metadata->trap_name = trap_name; + + if (metadata->fa_cookie) { + size_t cookie_size = sizeof(*fa_cookie) + + metadata->fa_cookie->cookie_len; + + fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, + GFP_ATOMIC); + if (!fa_cookie) + goto free_trap_name; + hw_metadata->fa_cookie = fa_cookie; + } + + hw_metadata->input_dev = metadata->input_dev; + netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker, + GFP_ATOMIC); + + return hw_metadata; + +free_trap_name: + kfree(trap_name); +free_trap_group: + kfree(trap_group_name); +free_hw_metadata: + kfree(hw_metadata); + return NULL; +} + +static void +net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) +{ + netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker); + kfree(hw_metadata->fa_cookie); + kfree(hw_metadata->trap_name); + kfree(hw_metadata->trap_group_name); + kfree(hw_metadata); +} + +static void net_dm_hw_packet_report(struct sk_buff *skb) +{ + struct devlink_trap_metadata *hw_metadata; + struct sk_buff *msg; + size_t payload_len; + int rc; + + if (skb->data > skb_mac_header(skb)) + skb_push(skb, skb->data - skb_mac_header(skb)); + else + skb_pull(skb, skb_mac_header(skb) - skb->data); + + payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); + if (net_dm_trunc_len) + payload_len = min_t(size_t, net_dm_trunc_len, payload_len); + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), + GFP_KERNEL); + if (!msg) + goto out; + + rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); + if (rc) { + nlmsg_free(msg); + goto out; + } + + genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); + +out: + net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); + consume_skb(skb); +} + +static void net_dm_hw_packet_work(struct work_struct *work) +{ + struct per_cpu_dm_data *hw_data; + struct sk_buff_head list; + struct sk_buff *skb; + unsigned long flags; + + hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + __skb_queue_head_init(&list); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + skb_queue_splice_tail_init(&hw_data->drop_queue, &list); + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + while ((skb = __skb_dequeue(&list))) + net_dm_hw_packet_report(skb); +} + +static void +net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, + struct sk_buff *skb, + const struct devlink_trap_metadata *metadata) +{ + struct devlink_trap_metadata *n_hw_metadata; + ktime_t tstamp = ktime_get_real(); + struct per_cpu_dm_data *hw_data; + struct sk_buff *nskb; + unsigned long flags; + + if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) + return; + + if (!skb_mac_header_was_set(skb)) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + n_hw_metadata = net_dm_hw_metadata_copy(metadata); + if (!n_hw_metadata) + goto free; + + NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; + nskb->tstamp = tstamp; + + hw_data = this_cpu_ptr(&dm_hw_cpu_data); + + spin_lock_irqsave(&hw_data->drop_queue.lock, flags); + if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) + __skb_queue_tail(&hw_data->drop_queue, nskb); + else + goto unlock_free; + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + + schedule_work(&hw_data->dm_alert_work); + + return; + +unlock_free: + spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); + u64_stats_update_begin(&hw_data->stats.syncp); + u64_stats_inc(&hw_data->stats.dropped); + u64_stats_update_end(&hw_data->stats.syncp); + net_dm_hw_metadata_free(n_hw_metadata); +free: + consume_skb(nskb); +} + +static const struct net_dm_alert_ops net_dm_alert_packet_ops = { + .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, + .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, + .work_item_func = net_dm_packet_work, + .hw_work_item_func = net_dm_hw_packet_work, + .hw_trap_probe = net_dm_hw_trap_packet_probe, +}; + +static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { + [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, + [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, +}; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) +static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) +{ + return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL); +} + +static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) +{ + unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); + tracepoint_synchronize_unregister(); +} +#else +static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) +{ + return -EOPNOTSUPP; +} + +static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) +{ +} +#endif + +static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu, rc; + + if (monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); + return -EAGAIN; + } + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_hw_entries *hw_entries; + + INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); + timer_setup(&hw_data->send_timer, sched_send_work, 0); + hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); + kfree(hw_entries); + } + + rc = net_dm_hw_probe_register(ops); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); + goto err_module_put; + } + + monitor_hw = true; + + return 0; + +err_module_put: + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct devlink_trap_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); + } + } + module_put(THIS_MODULE); + return rc; +} + +static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu; + + if (!monitor_hw) { + NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); + return; + } + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + monitor_hw = false; + + net_dm_hw_probe_unregister(ops); + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&hw_data->send_timer); + cancel_work_sync(&hw_data->dm_alert_work); + while ((skb = __skb_dequeue(&hw_data->drop_queue))) { + struct devlink_trap_metadata *hw_metadata; + + hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; + net_dm_hw_metadata_free(hw_metadata); + consume_skb(skb); + } + } + + module_put(THIS_MODULE); +} + +static int net_dm_trace_on_set(struct netlink_ext_ack *extack) +{ + const struct net_dm_alert_ops *ops; + int cpu, rc; + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + if (!try_module_get(THIS_MODULE)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); + return -ENODEV; + } + + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + INIT_WORK(&data->dm_alert_work, ops->work_item_func); + timer_setup(&data->send_timer, sched_send_work, 0); + /* Allocate a new per-CPU skb for the summary alert message and + * free the old one which might contain stale data from + * previous tracing. + */ + skb = reset_per_cpu_data(data); + consume_skb(skb); + } + + rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); + goto err_module_put; + } + + rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); + goto err_unregister_trace; + } + + return 0; + +err_unregister_trace: + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); +err_module_put: + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&data->send_timer); + cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); + } + module_put(THIS_MODULE); + return rc; +} + +static void net_dm_trace_off_set(void) +{ + const struct net_dm_alert_ops *ops; + int cpu; + + ops = net_dm_alert_ops_arr[net_dm_alert_mode]; + + unregister_trace_napi_poll(ops->napi_poll_probe, NULL); + unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); + + tracepoint_synchronize_unregister(); + + /* Make sure we do not send notifications to user space after request + * to stop tracing returns. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct sk_buff *skb; + + del_timer_sync(&data->send_timer); + cancel_work_sync(&data->dm_alert_work); + while ((skb = __skb_dequeue(&data->drop_queue))) + consume_skb(skb); + } + + module_put(THIS_MODULE); +} + +static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) +{ + int rc = 0; + + if (state == trace_state) { + NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); + return -EAGAIN; + } + + switch (state) { + case TRACE_ON: + rc = net_dm_trace_on_set(extack); + break; + case TRACE_OFF: + net_dm_trace_off_set(); + break; + default: + rc = 1; + break; + } + + if (!rc) + trace_state = state; + else + rc = -EINPROGRESS; + + return rc; +} + +static bool net_dm_is_monitoring(void) +{ + return trace_state == TRACE_ON || monitor_hw; +} + +static int net_dm_alert_mode_get_from_info(struct genl_info *info, + enum net_dm_alert_mode *p_alert_mode) +{ + u8 val; + + val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); + + switch (val) { + case NET_DM_ALERT_MODE_SUMMARY: + case NET_DM_ALERT_MODE_PACKET: + *p_alert_mode = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int net_dm_alert_mode_set(struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + enum net_dm_alert_mode alert_mode; + int rc; + + if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) + return 0; + + rc = net_dm_alert_mode_get_from_info(info, &alert_mode); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); + return -EINVAL; + } + + net_dm_alert_mode = alert_mode; + + return 0; +} + +static void net_dm_trunc_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) + return; + + net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); +} + +static void net_dm_queue_len_set(struct genl_info *info) +{ + if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) + return; + + net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); +} + +static int net_dm_cmd_config(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + int rc; + + if (net_dm_is_monitoring()) { + NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); + return -EBUSY; + } + + rc = net_dm_alert_mode_set(info); + if (rc) + return rc; + + net_dm_trunc_len_set(info); + + net_dm_queue_len_set(info); + + return 0; +} + +static int net_dm_monitor_start(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + bool sw_set = false; + int rc; + + if (set_sw) { + rc = set_all_monitor_traces(TRACE_ON, extack); + if (rc) + return rc; + sw_set = true; + } + + if (set_hw) { + rc = net_dm_hw_monitor_start(extack); + if (rc) + goto err_monitor_hw; + } + + return 0; + +err_monitor_hw: + if (sw_set) + set_all_monitor_traces(TRACE_OFF, extack); + return rc; +} + +static void net_dm_monitor_stop(bool set_sw, bool set_hw, + struct netlink_ext_ack *extack) +{ + if (set_hw) + net_dm_hw_monitor_stop(extack); + if (set_sw) + set_all_monitor_traces(TRACE_OFF, extack); +} + +static int net_dm_cmd_trace(struct sk_buff *skb, + struct genl_info *info) +{ + bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; + bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; + struct netlink_ext_ack *extack = info->extack; + + /* To maintain backward compatibility, we start / stop monitoring of + * software drops if no flag is specified. + */ + if (!set_sw && !set_hw) + set_sw = true; + + switch (info->genlhdr->cmd) { + case NET_DM_CMD_START: + return net_dm_monitor_start(set_sw, set_hw, extack); + case NET_DM_CMD_STOP: + net_dm_monitor_stop(set_sw, set_hw, extack); + return 0; + } + + return -EOPNOTSUPP; +} + +static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) + goto nla_put_failure; + + if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_config_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; +} + +static void net_dm_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + dropped = u64_stats_read(&cpu_stats->dropped); + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + u64_stats_add(&stats->dropped, dropped); + } +} + +static int net_dm_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static void net_dm_hw_stats_read(struct net_dm_stats *stats) +{ + int cpu; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(cpu) { + struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); + struct net_dm_stats *cpu_stats = &hw_data->stats; + unsigned int start; + u64 dropped; + + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + dropped = u64_stats_read(&cpu_stats->dropped); + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + u64_stats_add(&stats->dropped, dropped); + } +} + +static int net_dm_hw_stats_put(struct sk_buff *msg) +{ + struct net_dm_stats stats; + struct nlattr *attr; + + net_dm_hw_stats_read(&stats); + + attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, + u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) +{ + void *hdr; + int rc; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); + if (!hdr) + return -EMSGSIZE; + + rc = net_dm_stats_put(msg); + if (rc) + goto nla_put_failure; + + rc = net_dm_hw_stats_put(msg); + if (rc) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + int rc; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = net_dm_stats_fill(msg, info); + if (rc) + goto free_msg; + + return genlmsg_reply(msg, info); + +free_msg: + nlmsg_free(msg); + return rc; +} + +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct dm_hw_stat_delta *stat; + + switch (event) { + case NETDEV_REGISTER: + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; + + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); + + break; + case NETDEV_UNREGISTER: + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + kfree_rcu(stat, rcu); + } + break; + } + return NOTIFY_DONE; +} + +static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { + [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, + [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, + [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, + [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, + [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, +}; + +static const struct genl_small_ops dropmon_ops[] = { + { + .cmd = NET_DM_CMD_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = net_dm_cmd_config, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NET_DM_CMD_START, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NET_DM_CMD_STOP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = net_dm_cmd_trace, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NET_DM_CMD_CONFIG_GET, + .doit = net_dm_cmd_config_get, + }, + { + .cmd = NET_DM_CMD_STATS_GET, + .doit = net_dm_cmd_stats_get, + }, +}; + +static int net_dm_nl_pre_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + mutex_lock(&net_dm_mutex); + + return 0; +} + +static void net_dm_nl_post_doit(const struct genl_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + mutex_unlock(&net_dm_mutex); +} + +static struct genl_family net_drop_monitor_family __ro_after_init = { + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .maxattr = NET_DM_ATTR_MAX, + .policy = net_dm_nl_policy, + .pre_doit = net_dm_nl_pre_doit, + .post_doit = net_dm_nl_post_doit, + .module = THIS_MODULE, + .small_ops = dropmon_ops, + .n_small_ops = ARRAY_SIZE(dropmon_ops), + .resv_start_op = NET_DM_CMD_STATS_GET + 1, + .mcgrps = dropmon_mcgrps, + .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), +}; + +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + +static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) +{ + spin_lock_init(&data->lock); + skb_queue_head_init(&data->drop_queue); + u64_stats_init(&data->stats.syncp); +} + +static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) +{ + WARN_ON(!skb_queue_empty(&data->drop_queue)); +} + +static void net_dm_cpu_data_init(int cpu) +{ + struct per_cpu_dm_data *data; + + data = &per_cpu(dm_cpu_data, cpu); + __net_dm_cpu_data_init(data); +} + +static void net_dm_cpu_data_fini(int cpu) +{ + struct per_cpu_dm_data *data; + + data = &per_cpu(dm_cpu_data, cpu); + /* At this point, we should have exclusive access + * to this struct and can free the skb inside it. + */ + consume_skb(data->skb); + __net_dm_cpu_data_fini(data); +} + +static void net_dm_hw_cpu_data_init(int cpu) +{ + struct per_cpu_dm_data *hw_data; + + hw_data = &per_cpu(dm_hw_cpu_data, cpu); + __net_dm_cpu_data_init(hw_data); +} + +static void net_dm_hw_cpu_data_fini(int cpu) +{ + struct per_cpu_dm_data *hw_data; + + hw_data = &per_cpu(dm_hw_cpu_data, cpu); + kfree(hw_data->hw_entries); + __net_dm_cpu_data_fini(hw_data); +} + +static int __init init_net_drop_monitor(void) +{ + int cpu, rc; + + pr_info("Initializing network drop monitor service\n"); + + if (sizeof(void *) > 8) { + pr_err("Unable to store program counters on this arch, Drop monitor failed\n"); + return -ENOSPC; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); + return rc; + } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); + + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + pr_crit("Failed to register netdevice notifier\n"); + goto out_unreg; + } + + rc = 0; + + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); + } + + goto out; + +out_unreg: + genl_unregister_family(&net_drop_monitor_family); +out: + return rc; +} + +static void exit_net_drop_monitor(void) +{ + int cpu; + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); + + /* + * Because of the module_get/put we do in the trace state change path + * we are guaranteed not to have any current users when we get here + */ + + for_each_possible_cpu(cpu) { + net_dm_hw_cpu_data_fini(cpu); + net_dm_cpu_data_fini(cpu); + } + + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); +} + +module_init(init_net_drop_monitor); +module_exit(exit_net_drop_monitor); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); +MODULE_ALIAS_GENL_FAMILY("NET_DM"); +MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts"); diff --git a/net/core/dst.c b/net/core/dst.c new file mode 100644 index 000000000..d178c5641 --- /dev/null +++ b/net/core/dst.c @@ -0,0 +1,351 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * net/core/dst.c Protocol independent destination cache. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/types.h> +#include <net/net_namespace.h> +#include <linux/sched.h> +#include <linux/prefetch.h> +#include <net/lwtunnel.h> +#include <net/xfrm.h> + +#include <net/dst.h> +#include <net/dst_metadata.h> + +int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(dst_discard_out); + +const struct dst_metrics dst_default_metrics = { + /* This initializer is needed to force linker to place this variable + * into const section. Otherwise it might end into bss section. + * We really want to avoid false sharing on this variable, and catch + * any writes on it. + */ + .refcnt = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL(dst_default_metrics); + +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags) +{ + dst->dev = dev; + netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); + dst->ops = ops; + dst_init_metrics(dst, dst_default_metrics.metrics, true); + dst->expires = 0UL; +#ifdef CONFIG_XFRM + dst->xfrm = NULL; +#endif + dst->input = dst_discard; + dst->output = dst_discard_out; + dst->error = 0; + dst->obsolete = initial_obsolete; + dst->header_len = 0; + dst->trailer_len = 0; +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->tclassid = 0; +#endif + dst->lwtstate = NULL; + atomic_set(&dst->__refcnt, initial_ref); + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; + if (!(flags & DST_NOCOUNT)) + dst_entries_add(ops, 1); +} +EXPORT_SYMBOL(dst_init); + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, unsigned short flags) +{ + struct dst_entry *dst; + + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops); + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + + return dst; +} +EXPORT_SYMBOL(dst_alloc); + +struct dst_entry *dst_destroy(struct dst_entry * dst) +{ + struct dst_entry *child = NULL; + + smp_rmb(); + +#ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + + child = xdst->child; + } +#endif + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + netdev_put(dst->dev, &dst->dev_tracker); + + lwtstate_put(dst->lwtstate); + + if (dst->flags & DST_METADATA) + metadata_dst_free((struct metadata_dst *)dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) + dst_release_immediate(dst); + return NULL; +} +EXPORT_SYMBOL(dst_destroy); + +static void dst_destroy_rcu(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); +} + +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under blackhole interface and discard all tx/rx packets + * on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. + */ +void dst_dev_put(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, true); + dst->input = dst_discard; + dst->output = dst_discard_out; + dst->dev = blackhole_netdev; + netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); +} +EXPORT_SYMBOL(dst_dev_put); + +void dst_release(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt) + call_rcu(&dst->rcu_head, dst_destroy_rcu); + } +} +EXPORT_SYMBOL(dst_release); + +void dst_release_immediate(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt) + dst_destroy(dst); + } +} +EXPORT_SYMBOL(dst_release_immediate); + +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); + + if (p) { + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); + unsigned long prev, new; + + refcount_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + kfree(p); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (refcount_dec_and_test(&old_p->refcnt)) + kfree(old_p); + } + } + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; +} +EXPORT_SYMBOL(dst_cow_metrics_generic); + +/* Caller asserts that dst_metrics_read_only(dst) is false. */ +void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + unsigned long prev, new; + + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; + prev = cmpxchg(&dst->_metrics, old, new); + if (prev == old) + kfree(__DST_METRICS_PTR(old)); +} +EXPORT_SYMBOL(__dst_destroy_metrics_generic); + +struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + +u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + +struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + return NULL; +} + +void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu, + bool confirm_neigh) +{ +} +EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); + +void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb) +{ +} +EXPORT_SYMBOL_GPL(dst_blackhole_redirect); + +unsigned int dst_blackhole_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? : dst->dev->mtu; +} +EXPORT_SYMBOL_GPL(dst_blackhole_mtu); + +static struct dst_ops dst_blackhole_ops = { + .family = AF_UNSPEC, + .neigh_lookup = dst_blackhole_neigh_lookup, + .check = dst_blackhole_check, + .cow_metrics = dst_blackhole_cow_metrics, + .update_pmtu = dst_blackhole_update_pmtu, + .redirect = dst_blackhole_redirect, + .mtu = dst_blackhole_mtu, +}; + +static void __metadata_dst_init(struct metadata_dst *md_dst, + enum metadata_type type, u8 optslen) +{ + struct dst_entry *dst; + + dst = &md_dst->dst; + dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCOUNT); + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->type = type; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, + gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return NULL; + + __metadata_dst_init(md_dst, type, optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + +void metadata_dst_free(struct metadata_dst *md_dst) +{ +#ifdef CONFIG_DST_CACHE + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&md_dst->u.tun_info.dst_cache); +#endif + kfree(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free); + +struct metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) +{ + int cpu; + struct metadata_dst __percpu *md_dst; + + md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + __alignof__(struct metadata_dst), flags); + if (!md_dst) + return NULL; + + for_each_possible_cpu(cpu) + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ +#ifdef CONFIG_DST_CACHE + int cpu; + + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); + } +#endif + free_percpu(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c new file mode 100644 index 000000000..0ccfd5fa5 --- /dev/null +++ b/net/core/dst_cache.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/dst_cache.c - dst entry cache + * + * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <net/dst_cache.h> +#include <net/route.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_fib.h> +#endif +#include <uapi/linux/in.h> + +struct dst_cache_pcpu { + unsigned long refresh_ts; + struct dst_entry *dst; + u32 cookie; + union { + struct in_addr in_saddr; + struct in6_addr in6_saddr; + }; +}; + +static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, + struct dst_entry *dst, u32 cookie) +{ + dst_release(dst_cache->dst); + if (dst) + dst_hold(dst); + + dst_cache->cookie = cookie; + dst_cache->dst = dst; +} + +static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, + struct dst_cache_pcpu *idst) +{ + struct dst_entry *dst; + + dst = idst->dst; + if (!dst) + goto fail; + + /* the cache already hold a dst reference; it can't go away */ + dst_hold(dst); + + if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + dst_cache_per_cpu_dst_set(idst, NULL, 0); + dst_release(dst); + goto fail; + } + return dst; + +fail: + idst->refresh_ts = jiffies; + return NULL; +} + +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) +{ + if (!dst_cache->cache) + return NULL; + + return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); +} +EXPORT_SYMBOL_GPL(dst_cache_get); + +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in_saddr.s_addr; + return container_of(dst, struct rtable, dst); +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip4); + +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(idst, dst, 0); + idst->in_saddr.s_addr = saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip4); + +#if IS_ENABLED(CONFIG_IPV6) +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + + if (!dst_cache->cache) + return; + + idst = this_cpu_ptr(dst_cache->cache); + dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, + rt6_get_cookie((struct rt6_info *)dst)); + idst->in6_saddr = *saddr; +} +EXPORT_SYMBOL_GPL(dst_cache_set_ip6); + +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr) +{ + struct dst_cache_pcpu *idst; + struct dst_entry *dst; + + if (!dst_cache->cache) + return NULL; + + idst = this_cpu_ptr(dst_cache->cache); + dst = dst_cache_per_cpu_get(dst_cache, idst); + if (!dst) + return NULL; + + *saddr = idst->in6_saddr; + return dst; +} +EXPORT_SYMBOL_GPL(dst_cache_get_ip6); +#endif + +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) +{ + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, + gfp | __GFP_ZERO); + if (!dst_cache->cache) + return -ENOMEM; + + dst_cache_reset(dst_cache); + return 0; +} +EXPORT_SYMBOL_GPL(dst_cache_init); + +void dst_cache_destroy(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + for_each_possible_cpu(i) + dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); + + free_percpu(dst_cache->cache); +} +EXPORT_SYMBOL_GPL(dst_cache_destroy); + +void dst_cache_reset_now(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + dst_cache->reset_ts = jiffies; + for_each_possible_cpu(i) { + struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); + struct dst_entry *dst = idst->dst; + + idst->cookie = 0; + idst->dst = NULL; + dst_release(dst); + } +} +EXPORT_SYMBOL_GPL(dst_cache_reset_now); diff --git a/net/core/failover.c b/net/core/failover.c new file mode 100644 index 000000000..864d2d83e --- /dev/null +++ b/net/core/failover.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* A common module to handle registrations and notifications for paravirtual + * drivers to enable accelerated datapath and support VF live migration. + * + * The notifier and event handling code is based on netvsc driver. + */ + +#include <linux/module.h> +#include <linux/etherdevice.h> +#include <uapi/linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <net/failover.h> + +static LIST_HEAD(failover_list); +static DEFINE_SPINLOCK(failover_lock); + +static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) +{ + struct net_device *failover_dev; + struct failover *failover; + + spin_lock(&failover_lock); + list_for_each_entry(failover, &failover_list, list) { + failover_dev = rtnl_dereference(failover->failover_dev); + if (ether_addr_equal(failover_dev->perm_addr, mac)) { + *ops = rtnl_dereference(failover->ops); + spin_unlock(&failover_lock); + return failover_dev; + } + } + spin_unlock(&failover_lock); + return NULL; +} + +/** + * failover_slave_register - Register a slave netdev + * + * @slave_dev: slave netdev that is being registered + * + * Registers a slave device to a failover instance. Only ethernet devices + * are supported. + */ +static int failover_slave_register(struct net_device *slave_dev) +{ + struct netdev_lag_upper_info lag_upper_info; + struct net_device *failover_dev; + struct failover_ops *fops; + int err; + + if (slave_dev->type != ARPHRD_ETHER) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_register && + fops->slave_pre_register(slave_dev, failover_dev)) + goto done; + + err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, + failover_dev); + if (err) { + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", + err); + goto done; + } + + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; + err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, + &lag_upper_info, NULL); + if (err) { + netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", + failover_dev->name, err); + goto err_upper_link; + } + + slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + + if (fops && fops->slave_register && + !fops->slave_register(slave_dev, failover_dev)) + return NOTIFY_OK; + + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); +err_upper_link: + netdev_rx_handler_unregister(slave_dev); +done: + return NOTIFY_DONE; +} + +/** + * failover_slave_unregister - Unregister a slave netdev + * + * @slave_dev: slave netdev that is being unregistered + * + * Unregisters a slave device from a failover instance. + */ +int failover_slave_unregister(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (fops && fops->slave_pre_unregister && + fops->slave_pre_unregister(slave_dev, failover_dev)) + goto done; + + netdev_rx_handler_unregister(slave_dev); + netdev_upper_dev_unlink(slave_dev, failover_dev); + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + + if (fops && fops->slave_unregister && + !fops->slave_unregister(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} +EXPORT_SYMBOL_GPL(failover_slave_unregister); + +static int failover_slave_link_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_link_change && + !fops->slave_link_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int failover_slave_name_change(struct net_device *slave_dev) +{ + struct net_device *failover_dev; + struct failover_ops *fops; + + if (!netif_is_failover_slave(slave_dev)) + goto done; + + ASSERT_RTNL(); + + failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); + if (!failover_dev) + goto done; + + if (!netif_running(failover_dev)) + goto done; + + if (fops && fops->slave_name_change && + !fops->slave_name_change(slave_dev, failover_dev)) + return NOTIFY_OK; + +done: + return NOTIFY_DONE; +} + +static int +failover_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip parent events */ + if (netif_is_failover(event_dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return failover_slave_register(event_dev); + case NETDEV_UNREGISTER: + return failover_slave_unregister(event_dev); + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + return failover_slave_link_change(event_dev); + case NETDEV_CHANGENAME: + return failover_slave_name_change(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block failover_notifier = { + .notifier_call = failover_event, +}; + +static void +failover_existing_slave_register(struct net_device *failover_dev) +{ + struct net *net = dev_net(failover_dev); + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) { + if (netif_is_failover(dev)) + continue; + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + failover_slave_register(dev); + } + rtnl_unlock(); +} + +/** + * failover_register - Register a failover instance + * + * @dev: failover netdev + * @ops: failover ops + * + * Allocate and register a failover instance for a failover netdev. ops + * provides handlers for slave device register/unregister/link change/ + * name change events. + * + * Return: pointer to failover instance + */ +struct failover *failover_register(struct net_device *dev, + struct failover_ops *ops) +{ + struct failover *failover; + + if (dev->type != ARPHRD_ETHER) + return ERR_PTR(-EINVAL); + + failover = kzalloc(sizeof(*failover), GFP_KERNEL); + if (!failover) + return ERR_PTR(-ENOMEM); + + rcu_assign_pointer(failover->ops, ops); + netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); + dev->priv_flags |= IFF_FAILOVER; + rcu_assign_pointer(failover->failover_dev, dev); + + spin_lock(&failover_lock); + list_add_tail(&failover->list, &failover_list); + spin_unlock(&failover_lock); + + netdev_info(dev, "failover master:%s registered\n", dev->name); + + failover_existing_slave_register(dev); + + return failover; +} +EXPORT_SYMBOL_GPL(failover_register); + +/** + * failover_unregister - Unregister a failover instance + * + * @failover: pointer to failover instance + * + * Unregisters and frees a failover instance. + */ +void failover_unregister(struct failover *failover) +{ + struct net_device *failover_dev; + + failover_dev = rcu_dereference(failover->failover_dev); + + netdev_info(failover_dev, "failover master:%s unregistered\n", + failover_dev->name); + + failover_dev->priv_flags &= ~IFF_FAILOVER; + netdev_put(failover_dev, &failover->dev_tracker); + + spin_lock(&failover_lock); + list_del(&failover->list); + spin_unlock(&failover_lock); + + kfree(failover); +} +EXPORT_SYMBOL_GPL(failover_unregister); + +static __init int +failover_init(void) +{ + register_netdevice_notifier(&failover_notifier); + + return 0; +} +module_init(failover_init); + +static __exit +void failover_exit(void) +{ + unregister_netdevice_notifier(&failover_notifier); +} +module_exit(failover_exit); + +MODULE_DESCRIPTION("Generic failover infrastructure/interface"); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c new file mode 100644 index 000000000..fc9625980 --- /dev/null +++ b/net/core/fib_notifier.c @@ -0,0 +1,199 @@ +#include <linux/rtnetlink.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/fib_notifier.h> + +static unsigned int fib_notifier_net_id; + +struct fib_notifier_net { + struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; +}; + +int call_fib_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + int err; + + err = nb->notifier_call(nb, event_type, info); + return notifier_to_errno(err); +} +EXPORT_SYMBOL(call_fib_notifier); + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + int err; + + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); + return notifier_to_errno(err); +} +EXPORT_SYMBOL(call_fib_notifiers); + +static unsigned int fib_seq_sum(struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + struct fib_notifier_ops *ops; + unsigned int fib_seq = 0; + + rtnl_lock(); + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); + } + rcu_read_unlock(); + rtnl_unlock(); + + return fib_seq; +} + +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + struct fib_notifier_ops *ops; + int err = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + err = ops->fib_dump(net, nb, extack); + module_put(ops->owner); + if (err) + goto unlock; + } + +unlock: + rcu_read_unlock(); + + return err; +} + +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) + return true; + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct net *net, struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) +{ + int retries = 0; + int err; + + do { + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb, extack); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); + +static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, + struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + struct fib_notifier_ops *o; + + list_for_each_entry(o, &fn_net->fib_notifier_ops, list) + if (ops->family == o->family) + return -EEXIST; + list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops); + return 0; +} + +struct fib_notifier_ops * +fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net) +{ + struct fib_notifier_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (!ops) + return ERR_PTR(-ENOMEM); + + err = __fib_notifier_ops_register(ops, net); + if (err) + goto err_register; + + return ops; + +err_register: + kfree(ops); + return ERR_PTR(err); +} +EXPORT_SYMBOL(fib_notifier_ops_register); + +void fib_notifier_ops_unregister(struct fib_notifier_ops *ops) +{ + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); +} +EXPORT_SYMBOL(fib_notifier_ops_unregister); + +static int __net_init fib_notifier_net_init(struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); + return 0; +} + +static void __net_exit fib_notifier_net_exit(struct net *net) +{ + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops)); +} + +static struct pernet_operations fib_notifier_net_ops = { + .init = fib_notifier_net_init, + .exit = fib_notifier_net_exit, + .id = &fib_notifier_net_id, + .size = sizeof(struct fib_notifier_net), +}; + +static int __init fib_notifier_init(void) +{ + return register_pernet_subsys(&fib_notifier_net_ops); +} + +subsys_initcall(fib_notifier_init); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c new file mode 100644 index 000000000..75282222e --- /dev/null +++ b/net/core/fib_rules.c @@ -0,0 +1,1319 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * net/core/fib_rules.c Generic Routing Rules + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/module.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/fib_rules.h> +#include <net/ip_tunnels.h> +#include <linux/indirect_call_wrapper.h> + +#if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) +#ifdef CONFIG_IP_MULTIPLE_TABLES +#define INDIRECT_CALL_MT(f, f2, f1, ...) \ + INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) +#endif +#elif defined(CONFIG_IP_MULTIPLE_TABLES) +#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) +#endif + +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + +bool fib_rule_matchall(const struct fib_rule *rule) +{ + if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || + rule->flags) + return false; + if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) + return false; + if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || + !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) + return false; + if (fib_rule_port_range_set(&rule->sport_range)) + return false; + if (fib_rule_port_range_set(&rule->dport_range)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib_rule_matchall); + +int fib_default_rule_add(struct fib_rules_ops *ops, + u32 pref, u32 table, u32 flags) +{ + struct fib_rule *r; + + r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); + if (r == NULL) + return -ENOMEM; + + refcount_set(&r->refcnt, 1); + r->action = FR_ACT_TO_TBL; + r->pref = pref; + r->table = table; + r->flags = flags; + r->proto = RTPROT_KERNEL; + r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; + + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; + + /* The lock is not required here, the list in unreacheable + * at the moment this function is called */ + list_add_tail(&r->list, &ops->rules_list); + return 0; +} +EXPORT_SYMBOL(fib_default_rule_add); + +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) +{ + struct list_head *pos; + struct fib_rule *rule; + + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { + rule = list_entry(pos->next, struct fib_rule, list); + if (rule->pref) + return rule->pref - 1; + } + } + + return 0; +} + +static void notify_rule_change(int event, struct fib_rule *rule, + struct fib_rules_ops *ops, struct nlmsghdr *nlh, + u32 pid); + +static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +{ + struct fib_rules_ops *ops; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->rules_ops, list) { + if (ops->family == family) { + if (!try_module_get(ops->owner)) + ops = NULL; + rcu_read_unlock(); + return ops; + } + } + rcu_read_unlock(); + + return NULL; +} + +static void rules_ops_put(struct fib_rules_ops *ops) +{ + if (ops) + module_put(ops->owner); +} + +static void flush_route_cache(struct fib_rules_ops *ops) +{ + if (ops->flush_cache) + ops->flush_cache(ops); +} + +static int __fib_rules_register(struct fib_rules_ops *ops) +{ + int err = -EEXIST; + struct fib_rules_ops *o; + struct net *net; + + net = ops->fro_net; + + if (ops->rule_size < sizeof(struct fib_rule)) + return -EINVAL; + + if (ops->match == NULL || ops->configure == NULL || + ops->compare == NULL || ops->fill == NULL || + ops->action == NULL) + return -EINVAL; + + spin_lock(&net->rules_mod_lock); + list_for_each_entry(o, &net->rules_ops, list) + if (ops->family == o->family) + goto errout; + + list_add_tail_rcu(&ops->list, &net->rules_ops); + err = 0; +errout: + spin_unlock(&net->rules_mod_lock); + + return err; +} + +struct fib_rules_ops * +fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) +{ + struct fib_rules_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + err = __fib_rules_register(ops); + if (err) { + kfree(ops); + ops = ERR_PTR(err); + } + + return ops; +} +EXPORT_SYMBOL_GPL(fib_rules_register); + +static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) +{ + struct fib_rule *rule, *tmp; + + list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { + list_del_rcu(&rule->list); + if (ops->delete) + ops->delete(rule); + fib_rule_put(rule); + } +} + +void fib_rules_unregister(struct fib_rules_ops *ops) +{ + struct net *net = ops->fro_net; + + spin_lock(&net->rules_mod_lock); + list_del_rcu(&ops->list); + spin_unlock(&net->rules_mod_lock); + + fib_rules_cleanup_ops(ops); + kfree_rcu(ops, rcu); +} +EXPORT_SYMBOL_GPL(fib_rules_unregister); + +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + +static int nla_get_port_range(struct nlattr *pattr, + struct fib_rule_port_range *port_range) +{ + const struct fib_rule_port_range *pr = nla_data(pattr); + + if (!fib_rule_port_range_valid(pr)) + return -EINVAL; + + port_range->start = pr->start; + port_range->end = pr->end; + + return 0; +} + +static int nla_put_port_range(struct sk_buff *skb, int attrtype, + struct fib_rule_port_range *range) +{ + return nla_put(skb, attrtype, sizeof(*range), range); +} + +static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, + struct flowi *fl, int flags, + struct fib_lookup_arg *arg) +{ + int ret = 0; + + if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + goto out; + + if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + goto out; + + if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) + goto out; + + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + + if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) + goto out; + + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + + ret = INDIRECT_CALL_MT(ops->match, + fib6_rule_match, + fib4_rule_match, + rule, fl, flags); +out: + return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; +} + +int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, + int flags, struct fib_lookup_arg *arg) +{ + struct fib_rule *rule; + int err; + + rcu_read_lock(); + + list_for_each_entry_rcu(rule, &ops->rules_list, list) { +jumped: + if (!fib_rule_match(rule, ops, fl, flags, arg)) + continue; + + if (rule->action == FR_ACT_GOTO) { + struct fib_rule *target; + + target = rcu_dereference(rule->ctarget); + if (target == NULL) { + continue; + } else { + rule = target; + goto jumped; + } + } else if (rule->action == FR_ACT_NOP) + continue; + else + err = INDIRECT_CALL_MT(ops->action, + fib6_rule_action, + fib4_rule_action, + rule, fl, flags, arg); + + if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, + fib6_rule_suppress, + fib4_rule_suppress, + rule, flags, arg)) + continue; + + if (err != -EAGAIN) { + if ((arg->flags & FIB_LOOKUP_NOREF) || + likely(refcount_inc_not_zero(&rule->refcnt))) { + arg->rule = rule; + goto out; + } + break; + } + } + + err = -ESRCH; +out: + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(fib_rules_lookup); + +static int call_fib_rule_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) +{ + struct fib_rule_notifier_info info = { + .info.family = family, + .info.extack = extack, + .rule = rule, + }; + + return call_fib_notifier(nb, event_type, &info.info); +} + +static int call_fib_rule_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule, + struct fib_rules_ops *ops, + struct netlink_ext_ack *extack) +{ + struct fib_rule_notifier_info info = { + .info.family = ops->family, + .info.extack = extack, + .rule = rule, + }; + + ops->fib_rules_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +/* Called with rcu_read_lock() */ +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) +{ + struct fib_rules_ops *ops; + struct fib_rule *rule; + int err = 0; + + ops = lookup_rules_ops(net, family); + if (!ops) + return -EAFNOSUPPORT; + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family, extack); + if (err) + break; + } + rules_ops_put(ops); + + return err; +} +EXPORT_SYMBOL_GPL(fib_rules_dump); + +unsigned int fib_rules_seq_read(struct net *net, int family) +{ + unsigned int fib_rules_seq; + struct fib_rules_ops *ops; + + ASSERT_RTNL(); + + ops = lookup_rules_ops(net, family); + if (!ops) + return 0; + fib_rules_seq = ops->fib_rules_seq; + rules_ops_put(ops); + + return fib_rules_seq; +} +EXPORT_SYMBOL_GPL(fib_rules_seq_read); + +static struct fib_rule *rule_find(struct fib_rules_ops *ops, + struct fib_rule_hdr *frh, + struct nlattr **tb, + struct fib_rule *rule, + bool user_priority) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (rule->action && r->action != rule->action) + continue; + + if (rule->table && r->table != rule->table) + continue; + + if (user_priority && r->pref != rule->pref) + continue; + + if (rule->iifname[0] && + memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (rule->oifname[0] && + memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (rule->mark && r->mark != rule->mark) + continue; + + if (rule->suppress_ifgroup != -1 && + r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (rule->suppress_prefixlen != -1 && + r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (rule->mark_mask && r->mark_mask != rule->mark_mask) + continue; + + if (rule->tun_id && r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (rule->l3mdev && r->l3mdev != rule->l3mdev) + continue; + + if (uid_range_set(&rule->uid_range) && + (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end))) + continue; + + if (rule->ip_proto && r->ip_proto != rule->ip_proto) + continue; + + if (rule->proto && r->proto != rule->proto) + continue; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return r; + } + + return NULL; +} + +#ifdef CONFIG_NET_L3_MASTER_DEV +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + nlrule->l3mdev = nla_get_u8(nla); + if (nlrule->l3mdev != 1) { + NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); + return -1; + } + + return 0; +} +#else +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); + return -1; +} +#endif + +static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct fib_rule **rule, + bool *user_priority) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rule *nlrule = NULL; + int err = -EINVAL; + + if (frh->src_len) + if (!tb[FRA_SRC] || + frh->src_len > (ops->addr_size * 8) || + nla_len(tb[FRA_SRC]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid source address"); + goto errout; + } + + if (frh->dst_len) + if (!tb[FRA_DST] || + frh->dst_len > (ops->addr_size * 8) || + nla_len(tb[FRA_DST]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid dst address"); + goto errout; + } + + nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); + if (!nlrule) { + err = -ENOMEM; + goto errout; + } + refcount_set(&nlrule->refcnt, 1); + nlrule->fr_net = net; + + if (tb[FRA_PRIORITY]) { + nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); + *user_priority = true; + } else { + nlrule->pref = fib_default_rule_pref(ops); + } + + nlrule->proto = tb[FRA_PROTOCOL] ? + nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + nlrule->iifindex = -1; + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, nlrule->iifname); + if (dev) + nlrule->iifindex = dev->ifindex; + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + nlrule->oifindex = -1; + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, nlrule->oifname); + if (dev) + nlrule->oifindex = dev->ifindex; + } + + if (tb[FRA_FWMARK]) { + nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); + if (nlrule->mark) + /* compatibility: if the mark value is non-zero all bits + * are compared unless a mask is explicitly specified. + */ + nlrule->mark_mask = 0xFFFFFFFF; + } + + if (tb[FRA_FWMASK]) + nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + + if (tb[FRA_TUN_ID]) + nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + + err = -EINVAL; + if (tb[FRA_L3MDEV] && + fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) + goto errout_free; + + nlrule->action = frh->action; + nlrule->flags = frh->flags; + nlrule->table = frh_get_table(frh, tb); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + nlrule->suppress_prefixlen = -1; + + if (tb[FRA_SUPPRESS_IFGROUP]) + nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + nlrule->suppress_ifgroup = -1; + + if (tb[FRA_GOTO]) { + if (nlrule->action != FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Unexpected goto"); + goto errout_free; + } + + nlrule->target = nla_get_u32(tb[FRA_GOTO]); + /* Backward jumps are prohibited to avoid endless loops */ + if (nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); + goto errout_free; + } + } else if (nlrule->action == FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); + goto errout_free; + } + + if (nlrule->l3mdev && nlrule->table) { + NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); + goto errout_free; + } + + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + NL_SET_ERR_MSG(extack, "No permission to set uid"); + goto errout_free; + } + + nlrule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&nlrule->uid_range) || + !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { + NL_SET_ERR_MSG(extack, "Invalid uid range"); + goto errout_free; + } + } else { + nlrule->uid_range = fib_kuid_range_unset; + } + + if (tb[FRA_IP_PROTO]) + nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &nlrule->sport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid sport range"); + goto errout_free; + } + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &nlrule->dport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid dport range"); + goto errout_free; + } + } + + *rule = nlrule; + + return 0; + +errout_free: + kfree(nlrule); +errout: + return err; +} + +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (r->l3mdev != rule->l3mdev) + continue; + + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + + if (r->ip_proto != rule->ip_proto) + continue; + + if (r->proto != rule->proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + +static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { + [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, + [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_PRIORITY] = { .type = NLA_U32 }, + [FRA_FWMARK] = { .type = NLA_U32 }, + [FRA_FLOW] = { .type = NLA_U32 }, + [FRA_TUN_ID] = { .type = NLA_U64 }, + [FRA_FWMASK] = { .type = NLA_U32 }, + [FRA_TABLE] = { .type = NLA_U32 }, + [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, + [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, + [FRA_GOTO] = { .type = NLA_U32 }, + [FRA_L3MDEV] = { .type = NLA_U8 }, + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, + [FRA_PROTOCOL] = { .type = NLA_U8 }, + [FRA_IP_PROTO] = { .type = NLA_U8 }, + [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } +}; + +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule = NULL, *r, *last = NULL; + struct nlattr *tb[FRA_MAX + 1]; + int err = -EINVAL, unresolved = 0; + bool user_priority = false; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); + goto errout; + } + + ops = lookup_rules_ops(net, frh->family); + if (!ops) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); + goto errout; + } + + err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, + fib_rule_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); + goto errout; + } + + err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + if (err) + goto errout; + + if ((nlh->nlmsg_flags & NLM_F_EXCL) && + rule_exists(ops, frh, tb, rule)) { + err = -EEXIST; + goto errout_free; + } + + err = ops->configure(rule, skb, frh, tb, extack); + if (err < 0) + goto errout_free; + + err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, + extack); + if (err < 0) + goto errout_free; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref == rule->target) { + RCU_INIT_POINTER(rule->ctarget, r); + break; + } + } + + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) + unresolved = 1; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref > rule->pref) + break; + last = r; + } + + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + + if (ops->unresolved_rules) { + /* + * There are unresolved goto rules in the list, check if + * any of them are pointing to this new rule. + */ + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action == FR_ACT_GOTO && + r->target == rule->pref && + rtnl_dereference(r->ctarget) == NULL) { + rcu_assign_pointer(r->ctarget, rule); + if (--ops->unresolved_rules == 0) + break; + } + } + } + + if (rule->action == FR_ACT_GOTO) + ops->nr_goto_rules++; + + if (unresolved) + ops->unresolved_rules++; + + if (rule->tun_id) + ip_tunnel_need_metadata(); + + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + flush_route_cache(ops); + rules_ops_put(ops); + return 0; + +errout_free: + kfree(rule); +errout: + rules_ops_put(ops); + return err; +} +EXPORT_SYMBOL_GPL(fib_nl_newrule); + +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule = NULL, *r, *nlrule = NULL; + struct nlattr *tb[FRA_MAX+1]; + int err = -EINVAL; + bool user_priority = false; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); + goto errout; + } + + ops = lookup_rules_ops(net, frh->family); + if (ops == NULL) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); + goto errout; + } + + err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, + fib_rule_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); + goto errout; + } + + err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + if (err) + goto errout; + + rule = rule_find(ops, frh, tb, nlrule, user_priority); + if (!rule) { + err = -ENOENT; + goto errout; + } + + if (rule->flags & FIB_RULE_PERMANENT) { + err = -EPERM; + goto errout; + } + + if (ops->delete) { + err = ops->delete(rule); + if (err) + goto errout; + } + + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + + list_del_rcu(&rule->list); + + if (rule->action == FR_ACT_GOTO) { + ops->nr_goto_rules--; + if (rtnl_dereference(rule->ctarget) == NULL) + ops->unresolved_rules--; + } + + /* + * Check if this rule is a target to any of them. If so, + * adjust to the next one with the same preference or + * disable them. As this operation is eventually very + * expensive, it is only performed if goto rules, except + * current if it is goto rule, have actually been added. + */ + if (ops->nr_goto_rules > 0) { + struct fib_rule *n; + + n = list_next_entry(rule, list); + if (&n->list == &ops->rules_list || n->pref != rule->pref) + n = NULL; + list_for_each_entry(r, &ops->rules_list, list) { + if (rtnl_dereference(r->ctarget) != rule) + continue; + rcu_assign_pointer(r->ctarget, n); + if (!n) + ops->unresolved_rules++; + } + } + + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); + notify_rule_change(RTM_DELRULE, rule, ops, nlh, + NETLINK_CB(skb).portid); + fib_rule_put(rule); + flush_route_cache(ops); + rules_ops_put(ops); + kfree(nlrule); + return 0; + +errout: + kfree(nlrule); + rules_ops_put(ops); + return err; +} +EXPORT_SYMBOL_GPL(fib_nl_delrule); + +static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + struct fib_rule *rule) +{ + size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + + nla_total_size(4) /* FRA_PRIORITY */ + + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + + nla_total_size(4) /* FRA_FWMARK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size_64bit(8) /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)) + + nla_total_size(1) /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_IP_PROTO */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ + + if (ops->nlmsg_payload) + payload += ops->nlmsg_payload(rule); + + return payload; +} + +static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, + u32 pid, u32 seq, int type, int flags, + struct fib_rules_ops *ops) +{ + struct nlmsghdr *nlh; + struct fib_rule_hdr *frh; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); + if (nlh == NULL) + return -EMSGSIZE; + + frh = nlmsg_data(nlh); + frh->family = ops->family; + frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; + if (nla_put_u32(skb, FRA_TABLE, rule->table)) + goto nla_put_failure; + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) + goto nla_put_failure; + frh->res1 = 0; + frh->res2 = 0; + frh->action = rule->action; + frh->flags = rule->flags; + + if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) + goto nla_put_failure; + + if (rule->action == FR_ACT_GOTO && + rcu_access_pointer(rule->ctarget) == NULL) + frh->flags |= FIB_RULE_UNRESOLVED; + + if (rule->iifname[0]) { + if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) + goto nla_put_failure; + if (rule->iifindex == -1) + frh->flags |= FIB_RULE_IIF_DETACHED; + } + + if (rule->oifname[0]) { + if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) + goto nla_put_failure; + if (rule->oifindex == -1) + frh->flags |= FIB_RULE_OIF_DETACHED; + } + + if ((rule->pref && + nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || + (rule->mark && + nla_put_u32(skb, FRA_FWMARK, rule->mark)) || + ((rule->mark_mask || rule->mark) && + nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || + (rule->target && + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || + (rule->l3mdev && + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range)) || + (fib_rule_port_range_set(&rule->sport_range) && + nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (fib_rule_port_range_set(&rule->dport_range) && + nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) + goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + + if (ops->fill(rule, skb, frh) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_rules_ops *ops) +{ + int idx = 0; + struct fib_rule *rule; + int err = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + if (idx < cb->args[1]) + goto skip; + + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) + break; +skip: + idx++; + } + rcu_read_unlock(); + cb->args[1] = idx; + rules_ops_put(ops); + + return err; +} + +static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct fib_rule_hdr *frh; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); + return -EINVAL; + } + + frh = nlmsg_data(nlh); + if (frh->dst_len || frh->src_len || frh->tos || frh->table || + frh->res1 || frh->res2 || frh->action || frh->flags) { + NL_SET_ERR_MSG(extack, + "Invalid values in header for fib rule dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); + return -EINVAL; + } + + return 0; +} + +static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct net *net = sock_net(skb->sk); + struct fib_rules_ops *ops; + int idx = 0, family; + + if (cb->strict_check) { + int err = fib_valid_dumprule_req(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = rtnl_msg_family(nlh); + if (family != AF_UNSPEC) { + /* Protocol specific dump request */ + ops = lookup_rules_ops(net, family); + if (ops == NULL) + return -EAFNOSUPPORT; + + dump_rules(skb, cb, ops); + + return skb->len; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->rules_ops, list) { + if (idx < cb->args[0] || !try_module_get(ops->owner)) + goto skip; + + if (dump_rules(skb, cb, ops) < 0) + break; + + cb->args[1] = 0; +skip: + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +static void notify_rule_change(int event, struct fib_rule *rule, + struct fib_rules_ops *ops, struct nlmsghdr *nlh, + u32 pid) +{ + struct net *net; + struct sk_buff *skb; + int err = -ENOMEM; + + net = ops->fro_net; + skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, ops->nlgroup, err); +} + +static void attach_rules(struct list_head *rules, struct net_device *dev) +{ + struct fib_rule *rule; + + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == -1 && + strcmp(dev->name, rule->iifname) == 0) + rule->iifindex = dev->ifindex; + if (rule->oifindex == -1 && + strcmp(dev->name, rule->oifname) == 0) + rule->oifindex = dev->ifindex; + } +} + +static void detach_rules(struct list_head *rules, struct net_device *dev) +{ + struct fib_rule *rule; + + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == dev->ifindex) + rule->iifindex = -1; + if (rule->oifindex == dev->ifindex) + rule->oifindex = -1; + } +} + + +static int fib_rules_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct fib_rules_ops *ops; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_REGISTER: + list_for_each_entry(ops, &net->rules_ops, list) + attach_rules(&ops->rules_list, dev); + break; + + case NETDEV_CHANGENAME: + list_for_each_entry(ops, &net->rules_ops, list) { + detach_rules(&ops->rules_list, dev); + attach_rules(&ops->rules_list, dev); + } + break; + + case NETDEV_UNREGISTER: + list_for_each_entry(ops, &net->rules_ops, list) + detach_rules(&ops->rules_list, dev); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block fib_rules_notifier = { + .notifier_call = fib_rules_event, +}; + +static int __net_init fib_rules_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->rules_ops); + spin_lock_init(&net->rules_mod_lock); + return 0; +} + +static void __net_exit fib_rules_net_exit(struct net *net) +{ + WARN_ON_ONCE(!list_empty(&net->rules_ops)); +} + +static struct pernet_operations fib_rules_net_ops = { + .init = fib_rules_net_init, + .exit = fib_rules_net_exit, +}; + +static int __init fib_rules_init(void) +{ + int err; + rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); + + err = register_pernet_subsys(&fib_rules_net_ops); + if (err < 0) + goto fail; + + err = register_netdevice_notifier(&fib_rules_notifier); + if (err < 0) + goto fail_unregister; + + return 0; + +fail_unregister: + unregister_pernet_subsys(&fib_rules_net_ops); +fail: + rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); + rtnl_unregister(PF_UNSPEC, RTM_DELRULE); + rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + return err; +} + +subsys_initcall(fib_rules_init); diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 000000000..3a6110ea4 --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,11669 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> + * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in bpf_check_classic() + */ + +#include <linux/atomic.h> +#include <linux/bpf_verifier.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/sock_diag.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <linux/if_arp.h> +#include <linux/gfp.h> +#include <net/inet_common.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/netlink.h> +#include <linux/skbuff.h> +#include <linux/skmsg.h> +#include <net/sock.h> +#include <net/flow_dissector.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/uaccess.h> +#include <asm/unaligned.h> +#include <linux/filter.h> +#include <linux/ratelimit.h> +#include <linux/seccomp.h> +#include <linux/if_vlan.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <net/sch_generic.h> +#include <net/cls_cgroup.h> +#include <net/dst_metadata.h> +#include <net/dst.h> +#include <net/sock_reuseport.h> +#include <net/busy_poll.h> +#include <net/tcp.h> +#include <net/xfrm.h> +#include <net/udp.h> +#include <linux/bpf_trace.h> +#include <net/xdp_sock.h> +#include <linux/inetdevice.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> +#include <net/ip_fib.h> +#include <net/nexthop.h> +#include <net/flow.h> +#include <net/arp.h> +#include <net/ipv6.h> +#include <net/net_namespace.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h> +#include <net/lwtunnel.h> +#include <net/ipv6_stubs.h> +#include <net/bpf_sk_storage.h> +#include <net/transp_v6.h> +#include <linux/btf_ids.h> +#include <net/tls.h> +#include <net/xdp.h> +#include <net/mptcp.h> +#include <net/netfilter/nf_conntrack_bpf.h> + +static const struct bpf_func_proto * +bpf_sk_base_func_proto(enum bpf_func_id func_id); + +int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) +{ + if (in_compat_syscall()) { + struct compat_sock_fprog f32; + + if (len != sizeof(f32)) + return -EINVAL; + if (copy_from_sockptr(&f32, src, sizeof(f32))) + return -EFAULT; + memset(dst, 0, sizeof(*dst)); + dst->len = f32.len; + dst->filter = compat_ptr(f32.filter); + } else { + if (len != sizeof(*dst)) + return -EINVAL; + if (copy_from_sockptr(dst, src, sizeof(*dst))) + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); + +/** + * sk_filter_trim_cap - run a packet through a socket filter + * @sk: sock associated with &sk_buff + * @skb: buffer to filter + * @cap: limit on how short the eBPF program may trim the packet + * + * Run the eBPF program and then cut skb->data to correct size returned by + * the program. If pkt_len is 0 we toss packet. If skb->len is smaller + * than pkt_len we keep whole skb->data. This is the socket level + * wrapper to bpf_prog_run. It returns 0 if the packet should + * be accepted or -EPERM if the packet should be tossed. + * + */ +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) +{ + int err; + struct sk_filter *filter; + + /* + * If the skb was allocated from pfmemalloc reserves, only + * allow SOCK_MEMALLOC sockets to use it as this socket is + * helping free memory + */ + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + return -ENOMEM; + } + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + + err = security_sock_rcv_skb(sk, skb); + if (err) + return err; + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter) { + struct sock *save_sk = skb->sk; + unsigned int pkt_len; + + skb->sk = sk; + pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + skb->sk = save_sk; + err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; + } + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL(sk_filter_trim_cap); + +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) +{ + return skb_get_poff(skb); +} + +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +{ + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +{ + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (skb->len < sizeof(struct nlattr)) + return 0; + + if (a > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *) &skb->data[a]; + if (nla->nla_len > skb->len - a) + return 0; + + nla = nla_find_nested(nla, x); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + __be16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + __be32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + + case SKF_AD_VLAN_TAG: + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); + + /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_tci)); + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + break; + } + + return insn - insn_buf; +} + +static bool convert_bpf_extensions(struct sock_filter *fp, + struct bpf_insn **insnp) +{ + struct bpf_insn *insn = *insnp; + u32 cnt; + + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PROTOCOL: + BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); + + /* A = *(u16 *) (CTX + offsetof(protocol)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, protocol)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + + case SKF_AD_OFF + SKF_AD_PKTTYPE: + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_IFINDEX: + case SKF_AD_OFF + SKF_AD_HATYPE: + BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); + BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, dev)); + /* if (tmp != 0) goto pc + 1 */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); + *insn++ = BPF_EXIT_INSN(); + if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, ifindex)); + else + *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, + offsetof(struct net_device, type)); + break; + + case SKF_AD_OFF + SKF_AD_MARK: + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_RXHASH: + BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); + + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, hash)); + break; + + case SKF_AD_OFF + SKF_AD_QUEUE: + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG: + cnt = convert_skb_access(SKF_AD_VLAN_TAG, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, + BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TPID: + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); + + /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ + *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, + offsetof(struct sk_buff, vlan_proto)); + /* A = ntohs(A) [emitting a nop or swap16] */ + *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); + break; + + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + case SKF_AD_OFF + SKF_AD_NLATTR: + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + case SKF_AD_OFF + SKF_AD_CPU: + case SKF_AD_OFF + SKF_AD_RANDOM: + /* arg1 = CTX */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + /* arg2 = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); + /* arg3 = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); + /* Emit call(arg1=CTX, arg2=A, arg3=X) */ + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); + break; + case SKF_AD_OFF + SKF_AD_NLATTR: + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); + break; + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); + break; + case SKF_AD_OFF + SKF_AD_CPU: + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); + break; + case SKF_AD_OFF + SKF_AD_RANDOM: + *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); + bpf_user_rnd_init_once(); + break; + } + break; + + case SKF_AD_OFF + SKF_AD_ALU_XOR_X: + /* A ^= X */ + *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); + break; + + default: + /* This is just a dummy call to avoid letting the compiler + * evict __bpf_call_base() as an optimization. Placed here + * where no-one bothers. + */ + BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); + return false; + } + + *insnp = insn; + return true; +} + +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + bool ldx_off_ok = offset <= S16_MAX; + + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, + size, 2 + endian + (!ldx_off_ok * 2)); + if (ldx_off_ok) { + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_D, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_TMP, 0); + } + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + +/** + * bpf_convert_filter - convert filter program + * @prog: the user passed filter program + * @len: the length of the user passed filter program + * @new_prog: allocated 'struct bpf_prog' or NULL + * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind + * + * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' + * style extended BPF (eBPF). + * Conversion workflow: + * + * 1) First pass for calculating the new program length: + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + * jump offsets, 2nd pass remapping: + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) + */ +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) +{ + int new_flen = 0, pass = 0, target, i, stack_off; + struct bpf_insn *new_insn, *first_insn = NULL; + struct sock_filter *fp; + int *addrs = NULL; + u8 bpf_src; + + BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + + if (len <= 0 || len > BPF_MAXINSNS) + return -EINVAL; + + if (new_prog) { + first_insn = new_prog->insnsi; + addrs = kcalloc(len, sizeof(*addrs), + GFP_KERNEL | __GFP_NOWARN); + if (!addrs) + return -ENOMEM; + } + +do_pass: + new_insn = first_insn; + fp = prog; + + /* Classic BPF related prologue emission. */ + if (new_prog) { + /* Classic BPF expects A and X to be reset first. These need + * to be guaranteed to be the first two instructions. + */ + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + + /* All programs must keep CTX in callee saved BPF_REG_CTX. + * In eBPF case it's done by the compiler, here we need to + * do this ourself. Initial CTX is present in BPF_REG_ARG1. + */ + *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } + } else { + new_insn += 3; + } + + for (i = 0; i < len; fp++, i++) { + struct bpf_insn tmp_insns[32] = { }; + struct bpf_insn *insn = tmp_insns; + + if (addrs) + addrs[i] = new_insn - first_insn; + + switch (fp->code) { + /* All arithmetic insns and skb loads map as-is. */ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + /* Check for overloaded BPF extension and + * directly convert it if found, otherwise + * just move on with mapping. + */ + if (BPF_CLASS(fp->code) == BPF_LD && + BPF_MODE(fp->code) == BPF_ABS && + convert_bpf_extensions(fp, &insn)) + break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } + + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } + + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); + break; + + /* Jump transformation cannot use BPF block macros + * everywhere as offset calculation and target updates + * require a bit more work than the rest, i.e. jump + * opcodes map as-is, but offsets need adjustment. + */ + +#define BPF_EMIT_JMP \ + do { \ + const s32 off_min = S16_MIN, off_max = S16_MAX; \ + s32 off; \ + \ + if (target >= len || target < 0) \ + goto err; \ + off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ + /* Adjust pc relative offset for 2nd or 3rd insn. */ \ + off -= insn - tmp_insns; \ + /* Reject anything not fitting into insn->off. */ \ + if (off < off_min || off > off_max) \ + goto err; \ + insn->off = off; \ + } while (0) + + case BPF_JMP | BPF_JA: + target = i + fp->k + 1; + insn->code = fp->code; + BPF_EMIT_JMP; + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { + /* BPF immediates are signed, zero extend + * immediate into tmp register and use it + * in compare insn. + */ + *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); + + insn->dst_reg = BPF_REG_A; + insn->src_reg = BPF_REG_TMP; + bpf_src = BPF_X; + } else { + insn->dst_reg = BPF_REG_A; + insn->imm = fp->k; + bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; + } + + /* Common case where 'jump_false' is next insn. */ + if (fp->jf == 0) { + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + target = i + fp->jt + 1; + BPF_EMIT_JMP; + break; + } + + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + } +jmp_rest: + /* Other jumps are mapped into two insns: Jxx and JA. */ + target = i + fp->jt + 1; + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + BPF_EMIT_JMP; + insn++; + + insn->code = BPF_JMP | BPF_JA; + target = i + fp->jf + 1; + BPF_EMIT_JMP; + break; + + /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + /* A = BPF_R0 = *(u8 *) (skb->data + K) */ + convert_bpf_ld_abs(&tmp, &insn); + insn++; + /* A &= 0xf */ + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); + /* A <<= 2 */ + *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + /* A = tmp */ + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); + break; + } + /* RET_K is remaped into 2 insns. RET_A case doesn't need an + * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. + */ + case BPF_RET | BPF_A: + case BPF_RET | BPF_K: + if (BPF_RVAL(fp->code) == BPF_K) + *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, + 0, fp->k); + *insn = BPF_EXIT_INSN(); + break; + + /* Store to stack. */ + case BPF_ST: + case BPF_STX: + stack_off = fp->k * 4 + 4; + *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == + BPF_ST ? BPF_REG_A : BPF_REG_X, + -stack_off); + /* check_load_and_stores() verifies that classic BPF can + * load from stack only after write, so tracking + * stack_depth for ST|STX insns is enough + */ + if (new_prog && new_prog->aux->stack_depth < stack_off) + new_prog->aux->stack_depth = stack_off; + break; + + /* Load from stack. */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + stack_off = fp->k * 4 + 4; + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_FP, + -stack_off); + break; + + /* A = K or X = K */ + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, fp->k); + break; + + /* X = A */ + case BPF_MISC | BPF_TAX: + *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); + break; + + /* A = X */ + case BPF_MISC | BPF_TXA: + *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); + break; + + /* A = skb->len or X = skb->len */ + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LDX | BPF_W | BPF_LEN: + *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? + BPF_REG_A : BPF_REG_X, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + break; + + /* Access seccomp_data fields. */ + case BPF_LDX | BPF_ABS | BPF_W: + /* A = *(u32 *) (ctx + K) */ + *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); + break; + + /* Unknown instruction. */ + default: + goto err; + } + + insn++; + if (new_prog) + memcpy(new_insn, tmp_insns, + sizeof(*insn) * (insn - tmp_insns)); + new_insn += insn - tmp_insns; + } + + if (!new_prog) { + /* Only calculating new length. */ + *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ + return 0; + } + + pass++; + if (new_flen != new_insn - first_insn) { + new_flen = new_insn - first_insn; + if (pass > 2) + goto err; + goto do_pass; + } + + kfree(addrs); + BUG_ON(*new_len != new_flen); + return 0; +err: + kfree(addrs); + return -EINVAL; +} + +/* Security: + * + * As we dont want to clear mem[] array for each packet going through + * __bpf_prog_run(), we check that filter loaded by user never try to read + * a cell if not previously written, and we check all branches to be sure + * a malicious user doesn't try to abuse us. + */ +static int check_load_and_stores(const struct sock_filter *filter, int flen) +{ + u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ + int pc, ret = 0; + + BUILD_BUG_ON(BPF_MEMWORDS > 16); + + masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); + if (!masks) + return -ENOMEM; + + memset(masks, 0xff, flen * sizeof(*masks)); + + for (pc = 0; pc < flen; pc++) { + memvalid &= masks[pc]; + + switch (filter[pc].code) { + case BPF_ST: + case BPF_STX: + memvalid |= (1 << filter[pc].k); + break; + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + if (!(memvalid & (1 << filter[pc].k))) { + ret = -EINVAL; + goto error; + } + break; + case BPF_JMP | BPF_JA: + /* A jump must set masks on target */ + masks[pc + 1 + filter[pc].k] &= memvalid; + memvalid = ~0; + break; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* A jump must set masks on targets */ + masks[pc + 1 + filter[pc].jt] &= memvalid; + masks[pc + 1 + filter[pc].jf] &= memvalid; + memvalid = ~0; + break; + } + } +error: + kfree(masks); + return ret; +} + +static bool chk_code_allowed(u16 code_to_probe) +{ + static const bool codes[] = { + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_K] = true, + [BPF_ALU | BPF_ADD | BPF_X] = true, + [BPF_ALU | BPF_SUB | BPF_K] = true, + [BPF_ALU | BPF_SUB | BPF_X] = true, + [BPF_ALU | BPF_MUL | BPF_K] = true, + [BPF_ALU | BPF_MUL | BPF_X] = true, + [BPF_ALU | BPF_DIV | BPF_K] = true, + [BPF_ALU | BPF_DIV | BPF_X] = true, + [BPF_ALU | BPF_MOD | BPF_K] = true, + [BPF_ALU | BPF_MOD | BPF_X] = true, + [BPF_ALU | BPF_AND | BPF_K] = true, + [BPF_ALU | BPF_AND | BPF_X] = true, + [BPF_ALU | BPF_OR | BPF_K] = true, + [BPF_ALU | BPF_OR | BPF_X] = true, + [BPF_ALU | BPF_XOR | BPF_K] = true, + [BPF_ALU | BPF_XOR | BPF_X] = true, + [BPF_ALU | BPF_LSH | BPF_K] = true, + [BPF_ALU | BPF_LSH | BPF_X] = true, + [BPF_ALU | BPF_RSH | BPF_K] = true, + [BPF_ALU | BPF_RSH | BPF_X] = true, + [BPF_ALU | BPF_NEG] = true, + /* Load instructions */ + [BPF_LD | BPF_W | BPF_ABS] = true, + [BPF_LD | BPF_H | BPF_ABS] = true, + [BPF_LD | BPF_B | BPF_ABS] = true, + [BPF_LD | BPF_W | BPF_LEN] = true, + [BPF_LD | BPF_W | BPF_IND] = true, + [BPF_LD | BPF_H | BPF_IND] = true, + [BPF_LD | BPF_B | BPF_IND] = true, + [BPF_LD | BPF_IMM] = true, + [BPF_LD | BPF_MEM] = true, + [BPF_LDX | BPF_W | BPF_LEN] = true, + [BPF_LDX | BPF_B | BPF_MSH] = true, + [BPF_LDX | BPF_IMM] = true, + [BPF_LDX | BPF_MEM] = true, + /* Store instructions */ + [BPF_ST] = true, + [BPF_STX] = true, + /* Misc instructions */ + [BPF_MISC | BPF_TAX] = true, + [BPF_MISC | BPF_TXA] = true, + /* Return instructions */ + [BPF_RET | BPF_K] = true, + [BPF_RET | BPF_A] = true, + /* Jump instructions */ + [BPF_JMP | BPF_JA] = true, + [BPF_JMP | BPF_JEQ | BPF_K] = true, + [BPF_JMP | BPF_JEQ | BPF_X] = true, + [BPF_JMP | BPF_JGE | BPF_K] = true, + [BPF_JMP | BPF_JGE | BPF_X] = true, + [BPF_JMP | BPF_JGT | BPF_K] = true, + [BPF_JMP | BPF_JGT | BPF_X] = true, + [BPF_JMP | BPF_JSET | BPF_K] = true, + [BPF_JMP | BPF_JSET | BPF_X] = true, + }; + + if (code_to_probe >= ARRAY_SIZE(codes)) + return false; + + return codes[code_to_probe]; +} + +static bool bpf_check_basics_ok(const struct sock_filter *filter, + unsigned int flen) +{ + if (filter == NULL) + return false; + if (flen == 0 || flen > BPF_MAXINSNS) + return false; + + return true; +} + +/** + * bpf_check_classic - verify socket filter code + * @filter: filter to verify + * @flen: length of filter + * + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. + * + * All jumps are forward as they are not signed. + * + * Returns 0 if the rule set is legal or -EINVAL if not. + */ +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) +{ + bool anc_found; + int pc; + + /* Check the filter code now */ + for (pc = 0; pc < flen; pc++) { + const struct sock_filter *ftest = &filter[pc]; + + /* May we actually operate on this code? */ + if (!chk_code_allowed(ftest->code)) + return -EINVAL; + + /* Some instructions need special checks */ + switch (ftest->code) { + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + /* Check for division by zero */ + if (ftest->k == 0) + return -EINVAL; + break; + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + if (ftest->k >= 32) + return -EINVAL; + break; + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + /* Check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + case BPF_JMP | BPF_JA: + /* Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned int)(flen - pc - 1)) + return -EINVAL; + break; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* Both conditionals must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: + anc_found = false; + if (bpf_anc_helper(ftest) & BPF_ANC) + anc_found = true; + /* Ancillary operation unknown or unsupported */ + if (anc_found == false && ftest->k >= SKF_AD_OFF) + return -EINVAL; + } + } + + /* Last instruction must be a RET code */ + switch (filter[flen - 1].code) { + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + return check_load_and_stores(filter, flen); + } + + return -EINVAL; +} + +static int bpf_prog_store_orig_filter(struct bpf_prog *fp, + const struct sock_fprog *fprog) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct sock_fprog_kern *fkprog; + + fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); + if (!fp->orig_prog) + return -ENOMEM; + + fkprog = fp->orig_prog; + fkprog->len = fprog->len; + + fkprog->filter = kmemdup(fp->insns, fsize, + GFP_KERNEL | __GFP_NOWARN); + if (!fkprog->filter) { + kfree(fp->orig_prog); + return -ENOMEM; + } + + return 0; +} + +static void bpf_release_orig_filter(struct bpf_prog *fp) +{ + struct sock_fprog_kern *fprog = fp->orig_prog; + + if (fprog) { + kfree(fprog->filter); + kfree(fprog); + } +} + +static void __bpf_prog_release(struct bpf_prog *prog) +{ + if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } +} + +static void __sk_filter_release(struct sk_filter *fp) +{ + __bpf_prog_release(fp->prog); + kfree(fp); +} + +/** + * sk_filter_release_rcu - Release a socket filter by rcu_head + * @rcu: rcu_head that contains the sk_filter to free + */ +static void sk_filter_release_rcu(struct rcu_head *rcu) +{ + struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); + + __sk_filter_release(fp); +} + +/** + * sk_filter_release - release a socket filter + * @fp: filter to remove + * + * Remove a filter from a socket and release its resources. + */ +static void sk_filter_release(struct sk_filter *fp) +{ + if (refcount_dec_and_test(&fp->refcnt)) + call_rcu(&fp->rcu, sk_filter_release_rcu); +} + +void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) +{ + u32 filter_size = bpf_prog_size(fp->prog->len); + + atomic_sub(filter_size, &sk->sk_omem_alloc); + sk_filter_release(fp); +} + +/* try to charge the socket memory if there is space available + * return true on success + */ +static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + u32 filter_size = bpf_prog_size(fp->prog->len); + int optmem_max = READ_ONCE(sysctl_optmem_max); + + /* same check as in sock_kmalloc() */ + if (filter_size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { + atomic_add(filter_size, &sk->sk_omem_alloc); + return true; + } + return false; +} + +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; +} + +static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) +{ + struct sock_filter *old_prog; + struct bpf_prog *old_fp; + int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; + + /* We are free to overwrite insns et al right here as it won't be used at + * this point in time anymore internally after the migration to the eBPF + * instruction representation. + */ + BUILD_BUG_ON(sizeof(struct sock_filter) != + sizeof(struct bpf_insn)); + + /* Conversion cannot happen on overlapping memory areas, + * so we need to keep the user BPF around until the 2nd + * pass. At this time, the user BPF is stored in fp->insns. + */ + old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), + GFP_KERNEL | __GFP_NOWARN); + if (!old_prog) { + err = -ENOMEM; + goto out_err; + } + + /* 1st pass: calculate the new program length. */ + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); + if (err) + goto out_err_free; + + /* Expand fp for appending the new filter representation. */ + old_fp = fp; + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); + if (!fp) { + /* The old_fp is still around in case we couldn't + * allocate new memory, so uncharge on that one. + */ + fp = old_fp; + err = -ENOMEM; + goto out_err_free; + } + + fp->len = new_len; + + /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); + if (err) + /* 2nd bpf_convert_filter() can fail only if it fails + * to allocate memory, remapping must succeed. Note, + * that at this time old_fp has already been released + * by krealloc(). + */ + goto out_err_free; + + fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; + + kfree(old_prog); + return fp; + +out_err_free: + kfree(old_prog); +out_err: + __bpf_prog_release(fp); + return ERR_PTR(err); +} + +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) +{ + int err; + + fp->bpf_func = NULL; + fp->jited = 0; + + err = bpf_check_classic(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. + */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + + /* Probe if we can JIT compile the filter and if so, do + * the compilation of the filter. + */ + bpf_jit_compile(fp); + + /* JIT compiler couldn't process this filter, so do the eBPF translation + * for the optimized interpreter. + */ + if (!fp->jited) + fp = bpf_migrate_filter(fp); + + return fp; +} + +/** + * bpf_prog_create - create an unattached filter + * @pfp: the unattached filter that is created + * @fprog: the filter program + * + * Create a filter independent of any socket. We first run some + * sanity checks on it to make sure it does not explode on us later. + * If an error occurs or there is insufficient memory for the filter + * a negative errno code is returned. On success the return is zero. + */ +int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + + /* Make sure new filter is there and in the right amounts. */ + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + memcpy(fp->insns, fprog->filter, fsize); + + fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, NULL); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} +EXPORT_SYMBOL_GPL(bpf_prog_create); + +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * @save_orig: save classic BPF program + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. + * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans, bool save_orig) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + fp->orig_prog = NULL; + + if (save_orig) { + err = bpf_prog_store_orig_filter(fp, fprog); + if (err) { + __bpf_prog_free(fp); + return -ENOMEM; + } + } + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} +EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); + +void bpf_prog_destroy(struct bpf_prog *fp) +{ + __bpf_prog_release(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_destroy); + +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + fp->prog = prog; + + if (!__sk_filter_charge(sk, fp)) { + kfree(fp); + return -ENOMEM; + } + refcount_set(&fp->refcnt, 1); + + old_fp = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +static +struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *prog; + int err; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + /* Make sure new filter is there and in the right amounts. */ + if (!bpf_check_basics_ok(fprog->filter, fprog->len)) + return ERR_PTR(-EINVAL); + + prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!prog) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + __bpf_prog_free(prog); + return ERR_PTR(-EFAULT); + } + + prog->len = fprog->len; + + err = bpf_prog_store_orig_filter(prog, fprog); + if (err) { + __bpf_prog_free(prog); + return ERR_PTR(-ENOMEM); + } + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + return bpf_prepare_filter(prog, NULL); +} + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __sk_attach_prog(prog, sk); + if (err < 0) { + __bpf_prog_release(prog); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_attach_filter); + +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) + __bpf_prog_release(prog); + + return err; +} + +static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) +{ + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return ERR_PTR(-EPERM); + + return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); +} + +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __sk_attach_prog(prog, sk); + if (err < 0) { + bpf_prog_put(prog); + return err; + } + + return 0; +} + +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog; + int err; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { + err = -ENOMEM; + goto err_prog_put; + } + } + + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); +} + +struct bpf_scratchpad { + union { + __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; + u8 buff[MAX_BPF_STACK]; + }; +}; + +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); + +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + +static inline int bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_pointers(skb); + return err; +} + +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + +static inline void bpf_push_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + +static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) +{ + if (skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); +} + +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) +{ + void *ptr; + + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) + return -EINVAL; + if (unlikely(offset > INT_MAX)) + return -EFAULT; + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + + ptr = skb->data + offset; + if (flags & BPF_F_RECOMPUTE_CSUM) + __skb_postpull_rcsum(skb, ptr, len, offset); + + memcpy(ptr, from, len); + + if (flags & BPF_F_RECOMPUTE_CSUM) + __skb_postpush_rcsum(skb, ptr, len, offset); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) +{ + void *ptr; + + if (unlikely(offset > INT_MAX)) + goto err_clear; + + ptr = skb_header_pointer(skb, offset, len, to); + if (unlikely(!ptr)) + goto err_clear; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { + .func = bpf_skb_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_flow_dissector_load_bytes, + const struct bpf_flow_dissector *, ctx, u32, offset, + void *, to, u32, len) +{ + void *ptr; + + if (unlikely(offset > 0xffff)) + goto err_clear; + + if (unlikely(!ctx->skb)) + goto err_clear; + + ptr = skb_header_pointer(ctx->skb, offset, len, to); + if (unlikely(!ptr)) + goto err_clear; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { + .func = bpf_flow_dissector_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *end = skb_tail_pointer(skb); + u8 *start, *ptr; + + if (unlikely(offset > 0xffff)) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + if (unlikely(!skb_mac_header_was_set(skb))) + goto err_clear; + start = skb_mac_header(skb); + break; + case BPF_HDR_START_NET: + start = skb_network_header(skb); + break; + default: + goto err_clear; + } + + ptr = start + offset; + + if (likely(ptr + len <= end)) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ + return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { + .func = bpf_sk_fullsock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return __bpf_try_make_writable(skb, write_len); +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) +{ + __sum16 *ptr; + + if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) + return -EINVAL; + if (unlikely(offset > 0xffff || offset & 1)) + return -EFAULT; + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) + return -EFAULT; + + ptr = (__sum16 *)(skb->data + offset); + switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; + case 2: + csum_replace2(ptr, from, to); + break; + case 4: + csum_replace4(ptr, from, to); + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { + .func = bpf_l3_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) +{ + bool is_pseudo = flags & BPF_F_PSEUDO_HDR; + bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; + bool do_mforce = flags & BPF_F_MARK_ENFORCE; + __sum16 *ptr; + + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + return -EINVAL; + if (unlikely(offset > 0xffff || offset & 1)) + return -EFAULT; + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) + return -EFAULT; + + ptr = (__sum16 *)(skb->data + offset); + if (is_mmzero && !do_mforce && !*ptr) + return 0; + + switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + break; + case 2: + inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); + break; + case 4: + inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); + break; + default: + return -EINVAL; + } + + if (is_mmzero && !*ptr) + *ptr = CSUM_MANGLED_0; + return 0; +} + +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { + .func = bpf_l4_csum_replace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) +{ + struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); + u32 diff_size = from_size + to_size; + int i, j = 0; + + /* This is quite flexible, some examples: + * + * from_size == 0, to_size > 0, seed := csum --> pushing data + * from_size > 0, to_size == 0, seed := csum --> pulling data + * from_size > 0, to_size > 0, seed := 0 --> diffing data + * + * Even for diffing, from_size and to_size don't need to be equal. + */ + if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || + diff_size > sizeof(sp->diff))) + return -EINVAL; + + for (i = 0; i < from_size / sizeof(__be32); i++, j++) + sp->diff[j] = ~from[i]; + for (i = 0; i < to_size / sizeof(__be32); i++, j++) + sp->diff[j] = to[i]; + + return csum_partial(sp->diff, diff_size, seed); +} + +static const struct bpf_func_proto bpf_csum_diff_proto = { + .func = bpf_csum_diff, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) +{ + return dev_forward_skb_nomtu(dev, skb); +} + +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb, false); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + +static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) +{ + int ret; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + kfree_skb(skb); + return -ENETDOWN; + } + + skb->dev = dev; + skb_clear_tstamp(skb); + + dev_xmit_recursion_inc(); + ret = dev_queue_xmit(skb); + dev_xmit_recursion_dec(); + + return ret; +} + +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + unsigned int mlen = skb_network_offset(skb); + + if (unlikely(skb->len <= mlen)) { + kfree_skb(skb); + return -ERANGE; + } + + if (mlen) { + __skb_pull(skb, mlen); + if (unlikely(!skb->len)) { + kfree_skb(skb); + return -ERANGE; + } + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + } + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { + kfree_skb(skb); + return -ERANGE; + } + + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + if (dev_is_mac_header_xmit(dev)) + return __bpf_redirect_common(skb, dev, flags); + else + return __bpf_redirect_no_mac(skb, dev, flags); +} + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) +{ + u32 hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; + struct neighbour *neigh; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb_clear_tstamp(skb); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return -ENOMEM; + } + + rcu_read_lock(); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } + neigh = ip_neigh_gw6(dev, nexthop); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + local_bh_disable(); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, false); + dev_xmit_recursion_dec(); + local_bh_enable(); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock_bh(); + if (dst) + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; + + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } + + err = bpf_out_neigh_v6(net, skb, dev, nh); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_IPV6 */ + +#if IS_ENABLED(CONFIG_INET) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) +{ + u32 hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + bool is_v6gw = false; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb_clear_tstamp(skb); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return -ENOMEM; + } + + rcu_read_lock(); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock(); + goto out_drop; + } + + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + local_bh_disable(); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, is_v6gw); + dev_xmit_recursion_dec(); + local_bh_enable(); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } + + skb_dst_set(skb, &rt->dst); + } + + err = bpf_out_neigh_v4(net, skb, dev, nh); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_INET */ + +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) +{ + struct ethhdr *ethh = eth_hdr(skb); + + if (unlikely(skb->mac_header >= skb->network_header)) + goto out; + bpf_push_mac_rcsum(skb); + if (is_multicast_ether_addr(ethh->h_dest)) + goto out; + + skb_pull(skb, sizeof(*ethh)); + skb_unset_mac_header(skb); + skb_reset_network_header(skb); + + if (skb->protocol == htons(ETH_P_IP)) + return __bpf_redirect_neigh_v4(skb, dev, nh); + else if (skb->protocol == htons(ETH_P_IPV6)) + return __bpf_redirect_neigh_v6(skb, dev, nh); +out: + kfree_skb(skb); + return -ENOTSUPP; +} + +/* Internal, non-exposed redirect flags. */ +enum { + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) +}; + +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) +{ + struct net_device *dev; + struct sk_buff *clone; + int ret; + + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) + return -EINVAL; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); + if (unlikely(!dev)) + return -EINVAL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) + return -ENOMEM; + + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + return __bpf_redirect(clone, dev, flags); +} + +static const struct bpf_func_proto bpf_clone_redirect_proto = { + .func = bpf_clone_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); + +int skb_do_redirect(struct sk_buff *skb) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); + struct net_device *dev; + u32 flags = ri->flags; + + dev = dev_get_by_index_rcu(net, ri->tgt_index); + ri->tgt_index = 0; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !(dev->flags & IFF_UP) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; + } + return flags & BPF_F_NEIGH ? + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : + __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; +} + +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) + return TC_ACT_SHOT; + + ri->flags = flags; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_proto = { + .func = bpf_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely((plen && plen < sizeof(*params)) || flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); + ri->tgt_index = ifindex; + + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_neigh_proto = { + .func = bpf_redirect_neigh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static void sk_msg_reset_curr(struct sk_msg *msg) +{ + u32 i = msg->sg.start; + u32 len = 0; + + do { + len += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); + if (len >= msg->sg.size) + break; + } while (i != msg->sg.end); + + msg->sg.curr = i; + msg->sg.copybreak = 0; +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, + u32, end, u64, flags) +{ + u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; + u32 first_sge, last_sge, i, shift, bytes_sg_total; + struct scatterlist *sge; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + offset += len; + len = sk_msg_elem(msg, i)->length; + if (start < offset + len) + break; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + first_sge = i; + /* The start may point into the sg element so we need to also + * account for the headroom. + */ + bytes_sg_total = start - offset + bytes; + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) + goto out; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sk_msg_elem(msg, i)->length; + sk_msg_iter_var_next(i); + if (bytes_sg_total <= copy) + break; + } while (i != msg->sg.end); + last_sge = i; + + if (unlikely(bytes_sg_total > copy)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + + raw = page_address(page); + i = first_sge; + do { + sge = sk_msg_elem(msg, i); + from = sg_virt(sge); + len = sge->length; + to = raw + poffset; + + memcpy(to, from, len); + poffset += len; + sge->length = 0; + put_page(sg_page(sge)); + + sk_msg_iter_var_next(i); + } while (i != last_sge); + + sg_set_page(&msg->sg.data[first_sge], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. + */ + WARN_ON_ONCE(last_sge == first_sge); + shift = last_sge > first_sge ? + last_sge - first_sge - 1 : + NR_MSG_FRAG_IDS - first_sge + last_sge - 1; + if (!shift) + goto out; + + i = first_sge; + sk_msg_iter_var_next(i); + do { + u32 move_from; + + if (i + shift >= NR_MSG_FRAG_IDS) + move_from = i + shift - NR_MSG_FRAG_IDS; + else + move_from = i + shift; + if (move_from == msg->sg.end) + break; + + msg->sg.data[i] = msg->sg.data[move_from]; + msg->sg.data[move_from].length = 0; + msg->sg.data[move_from].page_link = 0; + msg->sg.data[move_from].offset = 0; + sk_msg_iter_var_next(i); + } while (1); + + msg->sg.end = msg->sg.end - shift > msg->sg.end ? + msg->sg.end - shift + NR_MSG_FRAG_IDS : + msg->sg.end - shift; +out: + sk_msg_reset_curr(msg); + msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; + msg->data_end = msg->data + bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; + u32 new, i = 0, l = 0, space, copy = 0, offset = 0; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(len == 0)) + return 0; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + offset += l; + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (start >= offset + l) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + /* If no space available will fallback to copy, we need at + * least one scatterlist elem available to push data into + * when start aligns to the beginning of an element or two + * when it falls inside an element. We handle the start equals + * offset case because its the common case for inserting a + * header. + */ + if (!space || (space == 1 && start != offset)) + copy = msg->sg.data[i].length; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy + len)); + if (unlikely(!page)) + return -ENOMEM; + + if (copy) { + int front, back; + + raw = page_address(page); + + psge = sk_msg_elem(msg, i); + front = start - offset; + back = psge->length - front; + from = sg_virt(psge); + + if (front) + memcpy(raw, from, front); + + if (back) { + from += front; + to = raw + front + len; + + memcpy(to, from, back); + } + + put_page(sg_page(psge)); + } else if (start - offset) { + psge = sk_msg_elem(msg, i); + rsge = sk_msg_elem_cpy(msg, i); + + psge->length = start - offset; + rsge.length -= psge->length; + rsge.offset += start; + + sk_msg_iter_var_next(i); + sg_unmark_end(psge); + sg_unmark_end(&rsge); + sk_msg_iter_next(msg, end); + } + + /* Slot(s) to place newly allocated data */ + new = i; + + /* Shift one or two slots as needed */ + if (!copy) { + sge = sk_msg_elem_cpy(msg, i); + + sk_msg_iter_var_next(i); + sg_unmark_end(&sge); + sk_msg_iter_next(msg, end); + + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { + sk_msg_iter_var_next(i); + nnsge = sk_msg_elem_cpy(msg, i); + } + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); + if (rsge.length) { + nsge = nnsge; + nnsge = sk_msg_elem_cpy(msg, i); + } else { + nsge = sk_msg_elem_cpy(msg, i); + } + } + } + + /* Place newly allocated data buffer */ + sk_mem_charge(msg->sk, len); + msg->sg.size += len; + __clear_bit(new, msg->sg.copy); + sg_set_page(&msg->sg.data[new], page, len + copy, 0); + if (rsge.length) { + get_page(sg_page(&rsge)); + sk_msg_iter_var_next(new); + msg->sg.data[new] = rsge; + } + + sk_msg_reset_curr(msg); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_push_data_proto = { + .func = bpf_msg_push_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l = 0, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + offset += l; + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + pop -= (sge->length - a); + sge->length = a; + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_reset_curr(msg); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_CGROUP_NET_CLASSID +BPF_CALL_0(bpf_get_cgroup_classid_curr) +{ + return __task_get_classid(current); +} + +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { + .func = bpf_get_cgroup_classid_curr, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return sock_cgroup_classid(&sk->sk_cgrp_data); +} + +static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { + .func = bpf_skb_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +#endif + +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) +{ + return task_get_classid(skb); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) +{ + return dst_tclassid(skb); +} + +static const struct bpf_func_proto bpf_get_route_realm_proto = { + .func = bpf_get_route_realm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) +{ + /* If skb_clear_hash() was called due to mangling, we can + * trigger SW recalculation here. Later access to hash + * can then use the inline skb->hash via context directly + * instead of calling this helper again. + */ + return skb_get_hash(skb); +} + +static const struct bpf_func_proto bpf_get_hash_recalc_proto = { + .func = bpf_get_hash_recalc, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +{ + /* Set user specified hash as L4(+), so that it gets returned + * on skb_get_hash() call unless BPF prog later on triggers a + * skb_clear_hash(). + */ + __skb_set_sw_hash(skb, hash, true); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_proto = { + .func = bpf_set_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) +{ + int ret; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + bpf_push_mac_rcsum(skb); + ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_pull_mac_rcsum(skb); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) +{ + int ret; + + bpf_push_mac_rcsum(skb); + ret = skb_vlan_pop(skb); + bpf_pull_mac_rcsum(skb); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) +{ + /* Caller already did skb_cow() with len as headroom, + * so no need to do it here. + */ + skb_push(skb, len); + memmove(skb->data, skb->data + len, off); + memset(skb->data + off, 0, len); + + /* No skb_postpush_rcsum(skb, skb->data + off, len) + * needed here as it does not change the skb->csum + * result for checksum complete when summing over + * zeroed blocks. + */ + return 0; +} + +static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) +{ + void *old_data; + + /* skb_ensure_writable() is not needed here, as we're + * already working on an uncloned skb. + */ + if (unlikely(!pskb_may_pull(skb, off + len))) + return -ENOMEM; + + old_data = skb->data; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, old_data + off, len); + memmove(skb->data, old_data, off); + + return 0; +} + +static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* There's no need for __skb_push()/__skb_pull() pair to + * get to the start of the mac header as we're guaranteed + * to always start from here under eBPF. + */ + ret = bpf_skb_generic_push(skb, off, len); + if (likely(!ret)) { + skb->mac_header -= len; + skb->network_header -= len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) +{ + bool trans_same = skb->transport_header == skb->network_header; + int ret; + + /* Same here, __skb_push()/__skb_pull() pair not needed. */ + ret = bpf_skb_generic_pop(skb, off, len); + if (likely(!ret)) { + skb->mac_header += len; + skb->network_header += len; + if (trans_same) + skb->transport_header = skb->network_header; + } + + return ret; +} + +static int bpf_skb_proto_4_to_6(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb_mac_header_len(skb); + int ret; + + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ + if (shinfo->gso_type & SKB_GSO_TCPV4) { + shinfo->gso_type &= ~SKB_GSO_TCPV4; + shinfo->gso_type |= SKB_GSO_TCPV6; + } + } + + skb->protocol = htons(ETH_P_IPV6); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_6_to_4(struct sk_buff *skb) +{ + const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + u32 off = skb_mac_header_len(skb); + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ + if (shinfo->gso_type & SKB_GSO_TCPV6) { + shinfo->gso_type &= ~SKB_GSO_TCPV6; + shinfo->gso_type |= SKB_GSO_TCPV4; + } + } + + skb->protocol = htons(ETH_P_IP); + skb_clear_hash(skb); + + return 0; +} + +static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) +{ + __be16 from_proto = skb->protocol; + + if (from_proto == htons(ETH_P_IP) && + to_proto == htons(ETH_P_IPV6)) + return bpf_skb_proto_4_to_6(skb); + + if (from_proto == htons(ETH_P_IPV6) && + to_proto == htons(ETH_P_IP)) + return bpf_skb_proto_6_to_4(skb); + + return -ENOTSUPP; +} + +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) +{ + int ret; + + if (unlikely(flags)) + return -EINVAL; + + /* General idea is that this helper does the basic groundwork + * needed for changing the protocol, and eBPF program fills the + * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() + * and other helpers, rather than passing a raw buffer here. + * + * The rationale is to keep this minimal and without a need to + * deal with raw packet data. F.e. even if we would pass buffers + * here, the program still needs to call the bpf_lX_csum_replace() + * helpers anyway. Plus, this way we keep also separation of + * concerns, since f.e. bpf_skb_store_bytes() should only take + * care of stores. + * + * Currently, additional options and extension header space are + * not supported, but flags register is reserved so we can adapt + * that. For offloads, we mark packet as dodgy, so that headers + * need to be verified first. + */ + ret = bpf_skb_proto_xlat(skb, proto); + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_proto_proto = { + .func = bpf_skb_change_proto, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) +{ + /* We only allow a restricted subset to be changed for now. */ + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || + !skb_pkt_type_ok(pkt_type))) + return -EINVAL; + + skb->pkt_type = pkt_type; + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_type_proto = { + .func = bpf_skb_change_type, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static u32 bpf_skb_net_base_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return sizeof(struct iphdr); + case htons(ETH_P_IPV6): + return sizeof(struct ipv6hdr); + default: + return ~0U; + } +} + +#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + +#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ + BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ + BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ + BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ + BPF_F_ADJ_ROOM_ENCAP_L2( \ + BPF_ADJ_ROOM_ENCAP_L2_MASK)) + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) +{ + u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; + bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; + u16 mac_len = 0, inner_net = 0, inner_trans = 0; + unsigned int gso_type = SKB_GSO_DODGY; + int ret; + + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } + + ret = skb_cow_head(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (encap) { + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -ENOTSUPP; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + return -EINVAL; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + + if (skb->encapsulation) + return -EALREADY; + + mac_len = skb->network_header - skb->mac_header; + inner_net = skb->network_header; + if (inner_mac_len > len_diff) + return -EINVAL; + inner_trans = skb->transport_header; + } + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (encap) { + skb->inner_mac_header = inner_net - inner_mac_len; + skb->inner_network_header = inner_net; + skb->inner_transport_header = inner_trans; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); + + skb->encapsulation = 1; + skb_set_network_header(skb, mac_len); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) + gso_type |= SKB_GSO_UDP_TUNNEL; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) + gso_type |= SKB_GSO_GRE; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + gso_type |= SKB_GSO_IPXIP6; + else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) + gso_type |= SKB_GSO_IPXIP4; + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || + flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { + int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? + sizeof(struct ipv6hdr) : + sizeof(struct iphdr); + + skb_set_transport_header(skb, mac_len + nh_len); + } + + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + } + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Due to header grow, MSS needs to be downgraded. */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_decrease_gso_size(shinfo, len_diff); + + /* Header must be checked, and gso_segs recomputed. */ + shinfo->gso_type |= gso_type; + shinfo->gso_segs = 0; + } + + return 0; +} + +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, + u64 flags) +{ + int ret; + + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + return -EINVAL; + + if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { + /* udp gso_size delineates datagrams, only allow if fixed */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || + !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + return -ENOTSUPP; + } + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Due to header shrink, MSS can be upgraded. */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) + skb_increase_gso_size(shinfo, len_diff); + + /* Header must be checked, and gso_segs recomputed. */ + shinfo->gso_type |= SKB_GSO_DODGY; + shinfo->gso_segs = 0; + } + + return 0; +} + +#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC + +BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_diff_abs = abs(len_diff); + bool shrink = len_diff < 0; + int ret = 0; + + if (unlikely(flags || mode)) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + + if (!shrink) { + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + __skb_push(skb, len_diff_abs); + memset(skb->data, 0, len_diff_abs); + } else { + if (unlikely(!pskb_may_pull(skb, len_diff_abs))) + return -ENOMEM; + __skb_pull(skb, len_diff_abs); + } + if (tls_sw_has_ctx_rx(skb->sk)) { + struct strp_msg *rxm = strp_msg(skb); + + rxm->full_len += len_diff; + } + return ret; +} + +static const struct bpf_func_proto sk_skb_adjust_room_proto = { + .func = sk_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + u32 len_cur, len_diff_abs = abs(len_diff); + u32 len_min = bpf_skb_net_base_len(skb); + u32 len_max = BPF_SKB_MAX_LEN; + __be16 proto = skb->protocol; + bool shrink = len_diff < 0; + u32 off; + int ret; + + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) + return -EINVAL; + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + if (unlikely(proto != htons(ETH_P_IP) && + proto != htons(ETH_P_IPV6))) + return -ENOTSUPP; + + off = skb_mac_header_len(skb); + switch (mode) { + case BPF_ADJ_ROOM_NET: + off += bpf_skb_net_base_len(skb); + break; + case BPF_ADJ_ROOM_MAC: + break; + default: + return -ENOTSUPP; + } + + len_cur = skb->len - skb_network_offset(skb); + if ((shrink && (len_diff_abs >= len_cur || + len_cur - len_diff_abs < len_min)) || + (!shrink && (skb->len + len_diff_abs > len_max && + !skb_is_gso(skb)))) + return -ENOTSUPP; + + ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : + bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_adjust_room_proto = { + .func = bpf_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +static u32 __bpf_skb_min_len(const struct sk_buff *skb) +{ + u32 min_len = skb_network_offset(skb); + + if (skb_transport_header_was_set(skb)) + min_len = skb_transport_offset(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + min_len = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + return min_len; +} + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) +{ + u32 max_len = BPF_SKB_MAX_LEN; + u32 min_len = __bpf_skb_min_len(skb); + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. + */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + return __bpf_skb_change_tail(skb, new_len, flags); +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) +{ + u32 max_len = BPF_SKB_MAX_LEN; + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. + */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + } + + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_pointers(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + return __bpf_skb_change_head(skb, head_room, flags); +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +{ + return xdp_get_buff_len(xdp); +} + +static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], +}; + +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; +} + +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp_frame_end + metalen; + void *data = xdp->data + offset; + + if (unlikely(data < data_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) +{ + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + void *src, *dst; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + src = flush ? buf : xdp->data + off; + dst = flush ? xdp->data + off : buf; + memcpy(dst, src, len); + return; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + src = flush ? buf : ptr_buf + copy_off; + dst = flush ? ptr_buf + copy_off : buf; + memcpy(dst, src, copy_len); + + off += copy_len; + len -= copy_len; + buf += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } +} + +static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len <= size ? addr + offset : NULL; +} + +BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, false); + else + memcpy(buf, ptr, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { + .func = bpf_xdp_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, true); + else + memcpy(ptr, buf, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { + .func = bpf_xdp_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; + struct xdp_rxq_info *rxq = xdp->rxq; + unsigned int tailroom; + + if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) + return -EOPNOTSUPP; + + tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + if (unlikely(offset > tailroom)) + return -EINVAL; + + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; + + return 0; +} + +static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, n_frags_free = 0, len_free = 0; + + if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) + return -EINVAL; + + for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { + skb_frag_t *frag = &sinfo->frags[i]; + int shrink = min_t(int, offset, skb_frag_size(frag)); + + len_free += shrink; + offset -= shrink; + + if (skb_frag_size(frag) == shrink) { + struct page *page = skb_frag_page(frag); + + __xdp_return(page_address(page), &xdp->rxq->mem, + false, NULL); + n_frags_free++; + } else { + skb_frag_size_sub(frag, shrink); + break; + } + } + sinfo->nr_frags -= n_frags_free; + sinfo->xdp_frags_size -= len_free; + + if (unlikely(!sinfo->nr_frags)) { + xdp_buff_clear_frags_flag(xdp); + xdp->data_end -= offset; + } + + return 0; +} + +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ + void *data_end = xdp->data_end + offset; + + if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ + if (offset < 0) + return bpf_xdp_frags_shrink_tail(xdp, -offset); + + return bpf_xdp_frags_increase_tail(xdp, offset); + } + + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp_frame_end || + meta > xdp->data)) + return -EINVAL; + if (unlikely(xdp_metalen_invalid(metalen))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +/* XDP_REDIRECT works by a three-step process, implemented in the functions + * below: + * + * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target + * of the redirect and store it (along with some other metadata) in a per-CPU + * struct bpf_redirect_info. + * + * 2. When the program returns the XDP_REDIRECT return code, the driver will + * call xdp_do_redirect() which will use the information in struct + * bpf_redirect_info to actually enqueue the frame into a map type-specific + * bulk queue structure. + * + * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), + * which will flush all the different bulk queues, thus completing the + * redirect. + * + * Pointers to the map entries will be kept around for this whole sequence of + * steps, protected by RCU. However, there is no top-level rcu_read_lock() in + * the core code; instead, the RCU protection relies on everything happening + * inside a single NAPI poll sequence, which means it's between a pair of calls + * to local_bh_disable()/local_bh_enable(). + * + * The map entries are marked as __rcu and the map code makes sure to + * dereference those pointers with rcu_dereference_check() in a way that works + * for both sections that to hold an rcu_read_lock() and sections that are + * called from NAPI without a separate rcu_read_lock(). The code below does not + * use RCU annotations, but relies on those in the map code. + */ +void xdp_do_flush(void) +{ + __dev_flush(); + __cpu_map_flush(); + __xsk_map_flush(); +} +EXPORT_SYMBOL_GPL(xdp_do_flush); + +void bpf_clear_redirect_map(struct bpf_map *map) +{ + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } +} + +DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); +EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); + +u32 xdp_master_redirect(struct xdp_buff *xdp) +{ + struct net_device *master, *slave; + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); + slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); + if (slave && slave != xdp->rxq->dev) { + /* The target device is different from the receiving device, so + * redirect it to the new device. + * Using XDP_REDIRECT gets the correct behaviour from XDP enabled + * drivers to unmap the packet from their rx ring. + */ + ri->tgt_index = slave->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + return XDP_REDIRECT; + } + return XDP_TX; +} +EXPORT_SYMBOL_GPL(xdp_master_redirect); + +static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + err = __xsk_map_redirect(fwd, xdp); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_frame *xdpf, + struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + struct bpf_map *map; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_enqueue_multi(xdpf, dev, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_enqueue(fwd, xdpf, dev); + } + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdpf, dev); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdpf, dev); + break; + } + fallthrough; + default: + err = -EBADRQC; + } + + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + /* XDP_REDIRECT is not fully supported yet for xdp frags since + * not all XDP capable drivers can map non-linear xdp_frame in + * ndo_xdp_xmit. + */ + if (unlikely(xdp_buff_has_frags(xdp) && + map_type != BPF_MAP_TYPE_CPUMAP)) + return -EOPNOTSUPP; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), + xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect); + +int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); + +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog, + void *fwd, + enum bpf_map_type map_type, u32 map_id) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map; + int err; + + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + map = READ_ONCE(ri->map); + if (unlikely(map)) { + WRITE_ONCE(ri->map, NULL); + err = dev_map_redirect_multi(dev, skb, xdp_prog, map, + ri->flags & BPF_F_EXCLUDE_INGRESS); + } else { + err = dev_map_generic_redirect(fwd, skb, xdp_prog); + } + if (unlikely(err)) + goto err; + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); + if (err) + goto err; + consume_skb(skb); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; + default: + err = -EBADRQC; + goto err; + } + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); +err: + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); + return err; +} + +BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ + ri->tgt_index = ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_xdp_redirect_proto = { + .func = bpf_xdp_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, + u64, flags) +{ + return map->ops->map_redirect(map, ifindex, flags); +} + +static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { + .func = bpf_xdp_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, + unsigned long off, unsigned long len) +{ + void *ptr = skb_header_pointer(skb, off, len, dst_buff); + + if (unlikely(!ptr)) + return len; + if (ptr != dst_buff) + memcpy(dst_buff, ptr, len); + + return 0; +} + +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(!skb || skb_size > skb->len)) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, + bpf_skb_copy); +} + +static const struct bpf_func_proto bpf_skb_event_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) + +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_skb_output_btf_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +static unsigned short bpf_tunnel_key_af(u64 flags) +{ + return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; +} + +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) +{ + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + u8 compat[sizeof(struct bpf_tunnel_key)]; + void *to_orig = to; + int err; + + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { + err = -EINVAL; + goto err_clear; + } + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { + err = -EPROTO; + goto err_clear; + } + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + err = -EINVAL; + switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + goto set_compat; + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + if (ip_tunnel_info_af(info) != AF_INET) + goto err_clear; +set_compat: + to = (struct bpf_tunnel_key *)compat; + break; + default: + goto err_clear; + } + } + + to->tunnel_id = be64_to_cpu(info->key.tun_id); + to->tunnel_tos = info->key.tos; + to->tunnel_ttl = info->key.ttl; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; + + if (flags & BPF_F_TUNINFO_IPV6) { + memcpy(to->remote_ipv6, &info->key.u.ipv6.src, + sizeof(to->remote_ipv6)); + memcpy(to->local_ipv6, &info->key.u.ipv6.dst, + sizeof(to->local_ipv6)); + to->tunnel_label = be32_to_cpu(info->key.label); + } else { + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); + memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); + to->tunnel_label = 0; + } + + if (unlikely(size != sizeof(struct bpf_tunnel_key))) + memcpy(to_orig, to, size); + + return 0; +err_clear: + memset(to_orig, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { + .func = bpf_skb_get_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) +{ + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + int err; + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + err = -ENOENT; + goto err_clear; + } + if (unlikely(size < info->options_len)) { + err = -ENOMEM; + goto err_clear; + } + + ip_tunnel_info_opts_get(to, info); + if (size > info->options_len) + memset(to + info->options_len, 0, size - info->options_len); + + return info->options_len; +err_clear: + memset(to, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +static struct metadata_dst __percpu *md_dst; + +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) +{ + struct metadata_dst *md = this_cpu_ptr(md_dst); + u8 compat[sizeof(struct bpf_tunnel_key)]; + struct ip_tunnel_info *info; + + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) + return -EINVAL; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): + case offsetof(struct bpf_tunnel_key, tunnel_label): + case offsetof(struct bpf_tunnel_key, tunnel_ext): + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + memcpy(compat, from, size); + memset(compat + size, 0, sizeof(compat) - size); + from = (const struct bpf_tunnel_key *) compat; + break; + default: + return -EINVAL; + } + } + if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || + from->tunnel_ext)) + return -EINVAL; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) md); + skb_dst_set(skb, (struct dst_entry *) md); + + info = &md->u.tun_info; + memset(info, 0, sizeof(*info)); + info->mode = IP_TUNNEL_INFO_TX; + + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; + if (flags & BPF_F_SEQ_NUMBER) + info->key.tun_flags |= TUNNEL_SEQ; + + info->key.tun_id = cpu_to_be64(from->tunnel_id); + info->key.tos = from->tunnel_tos; + info->key.ttl = from->tunnel_ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + info->mode |= IP_TUNNEL_INFO_IPV6; + memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, + sizeof(from->remote_ipv6)); + memcpy(&info->key.u.ipv6.src, from->local_ipv6, + sizeof(from->local_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; + } else { + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); + info->key.flow_flags = FLOWI_FLAG_ANYSRC; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .func = bpf_skb_set_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > IP_TUNNEL_OPTS_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) +{ + if (!md_dst) { + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) + return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); + } + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } +} + +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + struct sock *sk; + + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) + return -ENOENT; + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return sk_under_cgroup_hierarchy(sk, cgrp); +} + +static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { + .func = bpf_skb_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ + return __bpf_sk_cgroup_id(skb->sk); +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { + .func = bpf_skb_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) +{ + struct cgroup *ancestor; + struct cgroup *cgrp; + + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return cgroup_id(ancestor); +} + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, +}; +#endif + +static unsigned long bpf_xdp_copy(void *dst, const void *ctx, + unsigned long off, unsigned long len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + bpf_xdp_copy_buf(xdp, off, dst, len, false); + return 0; +} + +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + + if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, + xdp_size, bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_output_btf_ids[0], + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ? __sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { + .func = bpf_get_socket_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) +{ + return __sock_gen_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { + .func = bpf_get_socket_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + +BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __sock_gen_cookie(ctx->sk); +} + +static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { + .func = bpf_get_socket_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static u64 __bpf_get_netns_cookie(struct sock *sk) +{ + const struct net *net = sk ? sock_net(sk) : &init_net; + + return net->net_cookie; +} + +BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) +{ + return __bpf_get_netns_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { + .func = bpf_get_netns_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { + .func = bpf_get_netns_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { + .func = bpf_get_netns_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { + .func = bpf_get_netns_cookie_sk_msg, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) +{ + struct sock *sk = sk_to_full_sk(skb->sk); + kuid_t kuid; + + if (!sk || !sk_fullsock(sk)) + return overflowuid; + kuid = sock_net_uid(sock_net(sk), sk); + return from_kuid_munged(sock_net(sk)->user_ns, kuid); +} + +static const struct bpf_func_proto bpf_get_socket_uid_proto = { + .func = bpf_get_socket_uid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + switch (optname) { + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: + return -EINVAL; + } + + if (getopt) { + if (optname == SO_BINDTODEVICE) + return -EINVAL; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + val = *(int *)optval; + + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) + return -EINVAL; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + struct tcp_sock *tp; + int ret; + + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) + return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; + + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; +} + +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_prot->setsockopt != tcp_setsockopt) + return -EINVAL; + + switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; + break; + default: + if (getopt) + return -EINVAL; + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) + return -EINVAL; + memcpy(optval, tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. + */ + return 0; + } + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_family != AF_INET) + return -EINVAL; + + switch (optname) { + case IP_TOS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_family != AF_INET6) + return -EINVAL; + + switch (optname) { + case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (!sk_fullsock(sk)) + return -EINVAL; + + if (level == SOL_SOCKET) + return sol_socket_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + return sol_ip_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); + + return -EINVAL; +} + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + int err, saved_optlen = optlen; + + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; + } + + if (level == SOL_SOCKET) + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; + +done: + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; +} + +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { + .func = bpf_unlocked_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { + .func = bpf_unlocked_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, + int optname, const u8 **start) +{ + struct sk_buff *syn_skb = bpf_sock->syn_skb; + const u8 *hdr_start; + int ret; + + if (syn_skb) { + /* sk is a request_sock here */ + + if (optname == TCP_BPF_SYN) { + hdr_start = syn_skb->data; + ret = tcp_hdrlen(syn_skb); + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = skb_network_header(syn_skb); + ret = skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_MAC */ + hdr_start = skb_mac_header(syn_skb); + ret = skb_mac_header_len(syn_skb) + + skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } + } else { + struct sock *sk = bpf_sock->sk; + struct saved_syn *saved_syn; + + if (sk->sk_state == TCP_NEW_SYN_RECV) + /* synack retransmit. bpf_sock->syn_skb will + * not be available. It has to resort to + * saved_syn (if it is saved). + */ + saved_syn = inet_reqsk(sk)->saved_syn; + else + saved_syn = tcp_sk(sk)->saved_syn; + + if (!saved_syn) + return -ENOENT; + + if (optname == TCP_BPF_SYN) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen + + saved_syn->network_hdrlen; + ret = saved_syn->tcp_hdrlen; + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } else { + /* optname == TCP_BPF_SYN_MAC */ + + /* TCP_SAVE_SYN may not have saved the mac hdr */ + if (!saved_syn->mac_hdrlen) + return -ENOENT; + + hdr_start = saved_syn->data; + ret = saved_syn->mac_hdrlen + + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } + } + + *start = hdr_start; + return ret; +} + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { + int ret, copy_len = 0; + const u8 *start; + + ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); + if (ret > 0) { + copy_len = ret; + if (optlen < copy_len) { + copy_len = optlen; + ret = -ENOSPC; + } + + memcpy(optval, start, copy_len); + } + + /* Zero out unused buffer at the end */ + memset(optval + copy_len, 0, optlen - copy_len); + + return ret; + } + + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) + return -EINVAL; + + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; + int err; + + err = -EINVAL; + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return err; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +}; + +#ifdef CONFIG_XFRM +BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, + struct bpf_xfrm_state *, to, u32, size, u64, flags) +{ + const struct sec_path *sp = skb_sec_path(skb); + const struct xfrm_state *x; + + if (!sp || unlikely(index >= sp->len || flags)) + goto err_clear; + + x = sp->xvec[index]; + + if (unlikely(size != sizeof(struct bpf_xfrm_state))) + goto err_clear; + + to->reqid = x->props.reqid; + to->spi = x->id.spi; + to->family = x->props.family; + to->ext = 0; + + if (to->family == AF_INET6) { + memcpy(to->remote_ipv6, x->props.saddr.a6, + sizeof(to->remote_ipv6)); + } else { + to->remote_ipv4 = x->props.saddr.a4; + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + } + + return 0; +err_clear: + memset(to, 0, size); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { + .func = bpf_skb_get_xfrm_state, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; +#endif + +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) +{ + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + if (mtu) + params->mtu_result = mtu; /* union with tot_len */ + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct fib_nh_common *nhc; + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct flowi4 fl4; + u32 mtu = 0; + int err; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return BPF_FIB_LKUP_RET_FWD_DISABLED; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + fl4.flowi4_multipath_hash = 0; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err) { + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + if (fib_info_num_path(res.fi) > 1) + fib_select_path(net, &res, &fl4, NULL); + + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ + return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } + } + + nhc = res.nhc; + + /* do not handle lwt encaps right now */ + if (nhc->nhc_lwtstate) + return BPF_FIB_LKUP_RET_UNSUPP_LWT; + + dev = nhc->nhc_dev; + + params->rt_metric = res.fi->fib_priority; + params->ifindex = dev->ifindex; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + if (likely(nhc->nhc_gw_family != AF_INET6)) { + if (nhc->nhc_gw_family) + params->ipv4_dst = nhc->nhc_gw.ipv4; + } else { + struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; + + params->family = AF_INET6; + *dst = nhc->nhc_gw.ipv6; + } + + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + + if (likely(nhc->nhc_gw_family != AF_INET6)) + neigh = __ipv4_neigh_lookup_noref(dev, + (__force u32)params->ipv4_dst); + else + neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); + + if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) + return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct fib6_result res = {}; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct flowi6 fl6; + int strict = 0; + int oif, err; + u32 mtu = 0; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !idev->cnf.forwarding)) + return BPF_FIB_LKUP_RET_FWD_DISABLED; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowinfo; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, + strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); + } + + if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || + res.f6i == net->ipv6.fib6_null_entry)) + return BPF_FIB_LKUP_RET_NOT_FWDED; + + switch (res.fib6_type) { + /* only unicast is forwarded */ + case RTN_UNICAST: + break; + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, + fl6.flowi6_oif != 0, NULL, strict); + + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); + if (params->tot_len > mtu) { + params->mtu_result = mtu; /* union with tot_len */ + return BPF_FIB_LKUP_RET_FRAG_NEEDED; + } + } + + if (res.nh->fib_nh_lws) + return BPF_FIB_LKUP_RET_UNSUPP_LWT; + + if (res.nh->fib_nh_gw_family) + *dst = res.nh->fib_nh_gw6; + + dev = res.nh->fib_nh_dev; + params->rt_metric = res.f6i->fib6_metric; + params->ifindex = dev->ifindex; + + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) + goto set_fwd_params; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. + */ + neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); + if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) + return BPF_FIB_LKUP_RET_NO_NEIGH; + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + +set_fwd_params: + return bpf_fib_set_fwd_params(params, mtu); +} +#endif + +#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ + BPF_FIB_LOOKUP_SKIP_NEIGH) + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~BPF_FIB_LOOKUP_MASK) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif + } + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + struct net *net = dev_net(skb->dev); + int rc = -EAFNOSUPPORT; + bool check_mtu = false; + + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~BPF_FIB_LOOKUP_MASK) + return -EINVAL; + + if (params->tot_len) + check_mtu = true; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); + break; +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); + break; +#endif + } + + if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { + struct net_device *dev; + + /* When tot_len isn't provided by user, check skb + * against MTU of FIB lookup resulting net_device + */ + dev = dev_get_by_index_rcu(net, params->ifindex); + if (!is_skb_forwardable(dev, skb)) + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; + + params->mtu_result = dev->mtu; /* union with tot_len */ + } + + return rc; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int skb_len, dev_len; + int mtu; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + + if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + dev_len = mtu + dev->hard_header_len; + + /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ + skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len; + + skb_len += len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. + */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ + if (*mtu_len) + xdp_len = *mtu_len + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len, false)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) + return err; + + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress) +{ + return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); +#endif + default: + return -EINVAL; + } +} + +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, + void *, hdr, u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .func = bpf_lwt_in_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .func = bpf_lwt_xmit_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + void *srh_tlvs, *srh_end, *ptr; + int srhoff = 0; + + if (srh == NULL) + return -EINVAL; + + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= srh_end) + srh_state->valid = false; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + memcpy(skb->data + offset, from, len); + return 0; +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +static void bpf_update_srh_state(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { + srh_state->srh = NULL; + } else { + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen = srh_state->srh->hdrlen << 3; + srh_state->valid = true; + } +} + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int hdroff = 0; + int err; + + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_DT6: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + + if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) + return -EBADMSG; + if (!pskb_pull(skb, hdroff)) + return -EBADMSG; + + skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + bpf_compute_data_pointers(skb); + bpf_update_srh_state(skb); + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + bpf_update_srh_state(skb); + + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + bpf_update_srh_state(skb); + + return err; + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + void *srh_end, *srh_tlvs, *ptr; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + if (unlikely(srh == NULL)) + return -EINVAL; + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen += len; + srh_state->valid = false; + return 0; +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; +#endif /* CONFIG_IPV6_SEG6_BPF */ + +#ifdef CONFIG_INET +static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + int dif, int sdif, u8 family, u8 proto) +{ + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; + bool refcounted = false; + struct sock *sk = NULL; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, hinfo, NULL, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, NULL); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, hinfo, NULL, 0, + src6, tuple->ipv6.sport, + dst6, ntohs(tuple->ipv6.dport), + dif, sdif, &refcounted); + else if (likely(ipv6_bpf_stub)) + sk = ipv6_bpf_stub->udp6_lib_lookup(net, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, + &udp_table, NULL); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_skc_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + */ +static struct sock * +__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags, int sdif) +{ + struct sock *sk = NULL; + struct net *net; + u8 family; + + if (len == sizeof(tuple->ipv4)) + family = AF_INET; + else if (len == sizeof(tuple->ipv6)) + family = AF_INET6; + else + return NULL; + + if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) + goto out; + + if (sdif < 0) { + if (family == AF_INET) + sdif = inet_sdif(skb); + else + sdif = inet6_sdif(skb); + } + + if ((s32)netns_id < 0) { + net = caller_net; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + } else { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); + put_net(net); + } + +out: + return sk; +} + +static struct sock * +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags, int sdif) +{ + struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, proto, netns_id, flags, + sdif); + + if (sk) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { + sock_gen_put(sk); + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; + } + } + + return sk; +} + +static struct sock * +bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, + netns_id, flags, -1); +} + +static struct sock * +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, + flags); + + if (sk) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { + sock_gen_put(sk); + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; + } + } + + return sk; +} + +BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { + .func = bpf_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { + .func = bpf_tc_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { + .func = bpf_tc_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + struct net_device *dev = skb->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { + .func = bpf_tc_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (sk && sk_is_refcounted(sk)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { + .func = bpf_xdp_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net_device *dev = ctx->rxq->dev; + int ifindex = dev->ifindex, sdif = dev_sdif(dev); + struct net *caller_net = dev_net(dev); + + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags, sdif); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags, + -1); +} + +static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { + .func = bpf_sock_addr_skc_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_TCP, + netns_id, flags, -1); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, + sock_net(ctx->sk), 0, IPPROTO_UDP, + netns_id, flags, -1); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, + icsk_retransmits)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ + sizeof_field(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0) + +#define BPF_INET_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ + FIELD) > \ + sizeof_field(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct inet_connection_sock, \ + FIELD), \ + si->dst_reg, si->src_reg, \ + offsetof( \ + struct inet_connection_sock, \ + FIELD)); \ + } while (0) + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + case offsetof(struct bpf_tcp_sock, snd_cwnd): + BPF_TCP_SOCK_GET_COMMON(snd_cwnd); + break; + case offsetof(struct bpf_tcp_sock, srtt_us): + BPF_TCP_SOCK_GET_COMMON(srtt_us); + break; + case offsetof(struct bpf_tcp_sock, snd_ssthresh): + BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); + break; + case offsetof(struct bpf_tcp_sock, rcv_nxt): + BPF_TCP_SOCK_GET_COMMON(rcv_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_nxt): + BPF_TCP_SOCK_GET_COMMON(snd_nxt); + break; + case offsetof(struct bpf_tcp_sock, snd_una): + BPF_TCP_SOCK_GET_COMMON(snd_una); + break; + case offsetof(struct bpf_tcp_sock, mss_cache): + BPF_TCP_SOCK_GET_COMMON(mss_cache); + break; + case offsetof(struct bpf_tcp_sock, ecn_flags): + BPF_TCP_SOCK_GET_COMMON(ecn_flags); + break; + case offsetof(struct bpf_tcp_sock, rate_delivered): + BPF_TCP_SOCK_GET_COMMON(rate_delivered); + break; + case offsetof(struct bpf_tcp_sock, rate_interval_us): + BPF_TCP_SOCK_GET_COMMON(rate_interval_us); + break; + case offsetof(struct bpf_tcp_sock, packets_out): + BPF_TCP_SOCK_GET_COMMON(packets_out); + break; + case offsetof(struct bpf_tcp_sock, retrans_out): + BPF_TCP_SOCK_GET_COMMON(retrans_out); + break; + case offsetof(struct bpf_tcp_sock, total_retrans): + BPF_TCP_SOCK_GET_COMMON(total_retrans); + break; + case offsetof(struct bpf_tcp_sock, segs_in): + BPF_TCP_SOCK_GET_COMMON(segs_in); + break; + case offsetof(struct bpf_tcp_sock, data_segs_in): + BPF_TCP_SOCK_GET_COMMON(data_segs_in); + break; + case offsetof(struct bpf_tcp_sock, segs_out): + BPF_TCP_SOCK_GET_COMMON(segs_out); + break; + case offsetof(struct bpf_tcp_sock, data_segs_out): + BPF_TCP_SOCK_GET_COMMON(data_segs_out); + break; + case offsetof(struct bpf_tcp_sock, lost_out): + BPF_TCP_SOCK_GET_COMMON(lost_out); + break; + case offsetof(struct bpf_tcp_sock, sacked_out): + BPF_TCP_SOCK_GET_COMMON(sacked_out); + break; + case offsetof(struct bpf_tcp_sock, bytes_received): + BPF_TCP_SOCK_GET_COMMON(bytes_received); + break; + case offsetof(struct bpf_tcp_sock, bytes_acked): + BPF_TCP_SOCK_GET_COMMON(bytes_acked); + break; + case offsetof(struct bpf_tcp_sock, dsack_dups): + BPF_TCP_SOCK_GET_COMMON(dsack_dups); + break; + case offsetof(struct bpf_tcp_sock, delivered): + BPF_TCP_SOCK_GET_COMMON(delivered); + break; + case offsetof(struct bpf_tcp_sock, delivered_ce): + BPF_TCP_SOCK_GET_COMMON(delivered_ce); + break; + case offsetof(struct bpf_tcp_sock, icsk_retransmits): + BPF_INET_SOCK_GET_COMMON(icsk_retransmits); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + +BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_get_listener_sock_proto = { + .func = bpf_get_listener_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + iphdr_len = sizeof(struct iphdr); + break; + case cpu_to_be16(ETH_P_IPV6): + iphdr_len = sizeof(struct ipv6hdr); + break; + default: + return 0; + } + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + default: + return size == sizeof(__u32); + } +} + +u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_XDP_SOCK_GET(FIELD) \ + do { \ + BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ + sizeof_field(struct bpf_xdp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct xdp_sock, FIELD)); \ + } while (0) + + switch (si->off) { + case offsetof(struct bpf_xdp_sock, queue_id): + BPF_XDP_SOCK_GET(queue_id); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + int ret; + + if (unlikely(!sk || th_len < sizeof(*th))) + return -EINVAL; + + /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) + return -EINVAL; + + if (!th->ack || th->rst || th->syn) + return -ENOENT; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + if (tcp_synq_no_recent_overflow(sk)) + return -ENOENT; + + cookie = ntohl(th->ack_seq) - 1; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) + return -EINVAL; + + ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + + if (ret > 0) + return 0; + + return -ENOENT; +#else + return -ENOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { + .func = bpf_tcp_check_syncookie, + .gpl_only = true, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, + struct tcphdr *, th, u32, th_len) +{ +#ifdef CONFIG_SYN_COOKIES + u32 cookie; + u16 mss; + + if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) + return -EINVAL; + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) + return -ENOENT; + + if (!th->syn || th->ack || th->fin || th->rst) + return -EINVAL; + + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) + return -EINVAL; + + mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); + break; + +#if IS_BUILTIN(CONFIG_IPV6) + case 6: + if (unlikely(iph_len < sizeof(struct ipv6hdr))) + return -EINVAL; + + if (sk->sk_family != AF_INET6) + return -EINVAL; + + mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); + break; +#endif /* CONFIG_IPV6 */ + + default: + return -EPROTONOSUPPORT; + } + if (mss == 0) + return -ENOENT; + + return cookie | ((u64)mss << 32); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYN_COOKIES */ +} + +static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { + .func = bpf_tcp_gen_syncookie, + .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) +{ + if (!sk || flags != 0) + return -EINVAL; + if (!skb_at_tc_ingress(skb)) + return -EOPNOTSUPP; + if (unlikely(dev_net(skb->dev) != sock_net(sk))) + return -ENETUNREACH; + if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) + return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; + if (sk_is_refcounted(sk) && + unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + return -ENOENT; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_pfree; + + return 0; +} + +static const struct bpf_func_proto bpf_sk_assign_proto = { + .func = bpf_sk_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg3_type = ARG_ANYTHING, +}; + +static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, + u8 search_kind, const u8 *magic, + u8 magic_len, bool *eol) +{ + u8 kind, kind_len; + + *eol = false; + + while (op < opend) { + kind = op[0]; + + if (kind == TCPOPT_EOL) { + *eol = true; + return ERR_PTR(-ENOMSG); + } else if (kind == TCPOPT_NOP) { + op++; + continue; + } + + if (opend - op < 2 || opend - op < op[1] || op[1] < 2) + /* Something is wrong in the received header. + * Follow the TCP stack's tcp_parse_options() + * and just bail here. + */ + return ERR_PTR(-EFAULT); + + kind_len = op[1]; + if (search_kind == kind) { + if (!magic_len) + return op; + + if (magic_len > kind_len - 2) + return ERR_PTR(-ENOMSG); + + if (!memcmp(&op[2], magic, magic_len)) + return op; + } + + op += kind_len; + } + + return ERR_PTR(-ENOMSG); +} + +BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + void *, search_res, u32, len, u64, flags) +{ + bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; + const u8 *op, *opend, *magic, *search = search_res; + u8 search_kind, search_len, copy_len, magic_len; + int ret; + + /* 2 byte is the minimal option len except TCPOPT_NOP and + * TCPOPT_EOL which are useless for the bpf prog to learn + * and this helper disallow loading them also. + */ + if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) + return -EINVAL; + + search_kind = search[0]; + search_len = search[1]; + + if (search_len > len || search_kind == TCPOPT_NOP || + search_kind == TCPOPT_EOL) + return -EINVAL; + + if (search_kind == TCPOPT_EXP || search_kind == 253) { + /* 16 or 32 bit magic. +2 for kind and kind length */ + if (search_len != 4 && search_len != 6) + return -EINVAL; + magic = &search[2]; + magic_len = search_len - 2; + } else { + if (search_len) + return -EINVAL; + magic = NULL; + magic_len = 0; + } + + if (load_syn) { + ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); + if (ret < 0) + return ret; + + opend = op + ret; + op += sizeof(struct tcphdr); + } else { + if (!bpf_sock->skb || + bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) + /* This bpf_sock->op cannot call this helper */ + return -EPERM; + + opend = bpf_sock->skb_data_end; + op = bpf_sock->skb->data + sizeof(struct tcphdr); + } + + op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, + &eol); + if (IS_ERR(op)) + return PTR_ERR(op); + + copy_len = op[1]; + ret = copy_len; + if (copy_len > len) { + ret = -ENOSPC; + copy_len = len; + } + + memcpy(search_res, op, copy_len); + return ret; +} + +static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { + .func = bpf_sock_ops_load_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + const void *, from, u32, len, u64, flags) +{ + u8 new_kind, new_kind_len, magic_len = 0, *opend; + const u8 *op, *new_op, *magic = NULL; + struct sk_buff *skb; + bool eol; + + if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) + return -EPERM; + + if (len < 2 || flags) + return -EINVAL; + + new_op = from; + new_kind = new_op[0]; + new_kind_len = new_op[1]; + + if (new_kind_len > len || new_kind == TCPOPT_NOP || + new_kind == TCPOPT_EOL) + return -EINVAL; + + if (new_kind_len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + /* 253 is another experimental kind */ + if (new_kind == TCPOPT_EXP || new_kind == 253) { + if (new_kind_len < 4) + return -EINVAL; + /* Match for the 2 byte magic also. + * RFC 6994: the magic could be 2 or 4 bytes. + * Hence, matching by 2 byte only is on the + * conservative side but it is the right + * thing to do for the 'search-for-duplication' + * purpose. + */ + magic = &new_op[2]; + magic_len = 2; + } + + /* Check for duplication */ + skb = bpf_sock->skb; + op = skb->data + sizeof(struct tcphdr); + opend = bpf_sock->skb_data_end; + + op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, + &eol); + if (!IS_ERR(op)) + return -EEXIST; + + if (PTR_ERR(op) != -ENOMSG) + return PTR_ERR(op); + + if (eol) + /* The option has been ended. Treat it as no more + * header option can be written. + */ + return -ENOSPC; + + /* No duplication found. Store the header option. */ + memcpy(opend, from, new_kind_len); + + bpf_sock->remaining_opt_len -= new_kind_len; + bpf_sock->skb_data_end += new_kind_len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .func = bpf_sock_ops_store_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + u32, len, u64, flags) +{ + if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) + return -EPERM; + + if (flags || len < 2) + return -EINVAL; + + if (len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + bpf_sock->remaining_opt_len -= len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { + .func = bpf_sock_ops_reserve_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (tstamp_type) { + case BPF_SKB_TSTAMP_DELIVERY_MONO: + if (!tstamp) + return -EINVAL; + skb->tstamp = tstamp; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_TSTAMP_UNSPEC: + if (tstamp) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_SYN_COOKIES +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th, u32, th_len) +{ + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; + cookie = __cookie_v4_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th, u32, th_len) +{ +#if IS_BUILTIN(CONFIG_IPV6) + const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr); + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; + cookie = __cookie_v6_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th) +{ + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v4_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th) +{ +#if IS_BUILTIN(CONFIG_IPV6) + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v6_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; +#endif /* CONFIG_SYN_COOKIES */ + +#endif /* CONFIG_INET */ + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == sk_skb_change_head || + func == bpf_skb_change_tail || + func == sk_skb_change_tail || + func == bpf_skb_adjust_room || + func == sk_skb_adjust_room || + func == bpf_skb_pull_data || + func == sk_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data || + func == bpf_msg_push_data || + func == bpf_msg_pop_data || + func == bpf_xdp_adjust_tail || +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + func == bpf_lwt_seg6_store_bytes || + func == bpf_lwt_seg6_adjust_srh || + func == bpf_lwt_seg6_action || +#endif +#ifdef CONFIG_INET + func == bpf_sock_ops_store_hdr_opt || +#endif + func == bpf_lwt_in_push_encap || + func == bpf_lwt_xmit_push_encap) + return true; + + return false; +} + +const struct bpf_func_proto bpf_event_output_data_proto __weak; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; + +static const struct bpf_func_proto * +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_cg_sock_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_addr_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_sock_addr_skc_lookup_tcp_proto; +#endif /* CONFIG_INET */ + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } + default: + return bpf_sk_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + default: + return bpf_sk_base_func_proto(func_id); + } +} + +const struct bpf_func_proto bpf_sk_storage_get_proto __weak; +const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; + +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; +#endif + default: + return sk_filter_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_change_proto: + return &bpf_skb_change_proto_proto; + case BPF_FUNC_skb_change_type: + return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_adjust_room: + return &bpf_skb_adjust_room_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_redirect_neigh: + return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + case BPF_FUNC_set_hash: + return &bpf_set_hash_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_XFRM + case BPF_FUNC_skb_get_xfrm_state: + return &bpf_skb_get_xfrm_state_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_skb_cgroup_classid: + return &bpf_skb_cgroup_classid_proto; +#endif +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_tc_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_tc_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; + case BPF_FUNC_get_listener_sock: + return &bpf_get_listener_sock_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_tc_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif +#endif + default: + return bpf_sk_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; + case BPF_FUNC_redirect: + return &bpf_xdp_redirect_proto; + case BPF_FUNC_redirect_map: + return &bpf_xdp_redirect_map_proto; + case BPF_FUNC_xdp_adjust_tail: + return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_proto; + case BPF_FUNC_xdp_load_bytes: + return &bpf_xdp_load_bytes_proto; + case BPF_FUNC_xdp_store_bytes: + return &bpf_xdp_store_bytes_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_xdp_skc_lookup_tcp_proto; + case BPF_FUNC_tcp_check_syncookie: + return &bpf_tcp_check_syncookie_proto; + case BPF_FUNC_tcp_gen_syncookie: + return &bpf_tcp_gen_syncookie_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif +#endif + default: + return bpf_sk_base_func_proto(func_id); + } + +#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) + /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The + * kfuncs are defined in two different modules, and we want to be able + * to use them interchangably with the same BTF type ID. Because modules + * can't de-duplicate BTF IDs between each other, we need the type to be + * referenced in the vmlinux BTF or the verifier will get confused about + * the different types. So we add this dummy type reference which will + * be included in vmlinux BTF, allowing both modules to refer to the + * same type ID. + */ + BTF_TYPE_EMIT(struct nf_conn___init); +#endif +} + +const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; + +static const struct bpf_func_proto * +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sock_ops_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sock_ops_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; + case BPF_FUNC_sock_map_update: + return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_sock_ops_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_ops_proto; +#ifdef CONFIG_INET + case BPF_FUNC_load_hdr_opt: + return &bpf_sock_ops_load_hdr_opt_proto; + case BPF_FUNC_store_hdr_opt: + return &bpf_sock_ops_store_hdr_opt_proto; + case BPF_FUNC_reserve_hdr_opt: + return &bpf_sock_ops_reserve_hdr_opt_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif /* CONFIG_INET */ + default: + return bpf_sk_base_func_proto(func_id); + } +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; + +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; + case BPF_FUNC_msg_push_data: + return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sk_msg_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + default: + return bpf_sk_base_func_proto(func_id); + } +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; + +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &sk_skb_pull_data_proto; + case BPF_FUNC_skb_change_tail: + return &sk_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &sk_skb_change_head_proto; + case BPF_FUNC_skb_adjust_room: + return &sk_skb_adjust_room_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_sk_redirect_map: + return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; +#endif + default: + return bpf_sk_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_flow_dissector_load_bytes_proto; + default: + return bpf_sk_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_sk_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_in_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_xmit_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; +#endif + default: + return lwt_out_func_proto(func_id, prog); + } +} + +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + if (off + size > offsetofend(struct __sk_buff, cb[4])) + return false; + break; + case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): + case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) + return false; + break; + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + return false; + case bpf_ctx_range(struct __sk_buff, hwtstamp): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; + case offsetof(struct __sk_buff, sk): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + break; + case offsetof(struct __sk_buff, tstamp_type): + return false; + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: + /* Explicitly prohibit access to padding in __sk_buff. */ + return false; + default: + /* Only narrow read access allowed for now. */ + if (type == BPF_WRITE) { + if (size != size_default) + return false; + } else { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } + } + + return true; +} + +static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool cg_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, wire_len): + return false; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (!bpf_capable()) + return false; + break; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!bpf_capable()) + return false; + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) +{ + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range_till(struct bpf_sock, type, priority): + return false; + default: + return bpf_sock_is_valid_access(off, size, type, info); + } +} + +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + int field_size; + + if (off < 0 || off >= sizeof(struct bpf_sock)) + return false; + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_sock, state): + case offsetof(struct bpf_sock, family): + case offsetof(struct bpf_sock, type): + case offsetof(struct bpf_sock, protocol): + case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + case bpf_ctx_range(struct bpf_sock, dst_ip4): + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; + } + + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_sock_is_valid_access(off, size, type, info)) + return false; + return __sock_filter_check_attach_type(off, type, + prog->expected_attach_type); +} + +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action. + */ + return 0; +} + +static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog, int drop_verdict) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + /* We're guaranteed here that CTX is in R6. */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, queue_mapping): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; + case offsetof(struct __sk_buff, tstamp_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->tstamp_type_access = 1; + return size == sizeof(__u8); + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + +static bool __is_valid_xdp_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct xdp_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool xdp_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + + if (type == BPF_WRITE) { + if (bpf_prog_is_dev_bound(prog->aux)) { + switch (off) { + case offsetof(struct xdp_md, rx_queue_index): + return __is_valid_xdp_access(off, size); + } + } + return false; + } + + switch (off) { + case offsetof(struct xdp_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct xdp_md, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; + case offsetof(struct xdp_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_xdp_access(off, size); +} + +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +{ + const u32 act_max = XDP_REDIRECT; + + pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); +} +EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); + +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 contex and vise + * versa. + */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP6_RECVMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP4_SENDMSG: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_UDP6_SENDMSG: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + + if (size != size_default) + return false; + } + break; + case offsetof(struct bpf_sock_addr, sk): + if (type != BPF_READ) + return false; + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; + break; + default: + return false; + } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + case offsetof(struct bpf_sock_ops, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET_OR_NULL; + break; + case offsetof(struct bpf_sock_ops, skb_data): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, + size_default); + default: + if (size != size_default) + return false; + break; + } + } + + return true; +} + +static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); +} + +static bool sk_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + return false; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; + break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case bpf_ctx_range(struct sk_msg_md, family): + case bpf_ctx_range(struct sk_msg_md, remote_ip4): + case bpf_ctx_range(struct sk_msg_md, local_ip4): + case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct sk_msg_md, remote_port): + case bpf_ctx_range(struct sk_msg_md, local_port): + case bpf_ctx_range(struct sk_msg_md, size): + if (size != sizeof(__u32)) + return false; + break; + default: + return false; + } + return true; +} + +static bool flow_dissector_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + if (type == BPF_WRITE) + return false; + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + if (size != size_default) + return false; + info->reg_type = PTR_TO_PACKET; + return true; + case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) + return false; + info->reg_type = PTR_TO_PACKET_END; + return true; + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_FLOW_KEYS; + return true; + default: + return false; + } +} + +static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) + +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, data)); + break; + + case offsetof(struct __sk_buff, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, data_end)); + break; + + case offsetof(struct __sk_buff, flow_keys): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), + si->dst_reg, si->src_reg, + offsetof(struct bpf_flow_dissector, flow_keys)); + break; + } + + return insn - insn_buf; +} + +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK, 2); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); + *insn++ = BPF_JMP_A(1); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + + return insn; +} + +static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + +#ifdef CONFIG_NET_CLS_ACT + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, read skb->tstamp as is if tstamp_type_access is true. + */ + if (!prog->tstamp_type_access) { + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, + TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); + /* skb->tc_at_ingress && skb->mono_delivery_time, + * read 0 as the (rcv) timestamp. + */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } +#endif + + *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->src_reg; + __u8 skb_reg = si->dst_reg; + +#ifdef CONFIG_NET_CLS_ACT + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, write skb->tstamp as is if tstamp_type_access is true. + * Otherwise, writing at ingress will have to clear the + * mono_delivery_time bit also. + */ + if (!prog->tstamp_type_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + /* Writing __sk_buff->tstamp as ingress, goto <clear> */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + /* goto <store> */ + *insn++ = BPF_JMP_A(2); + /* <clear>: mono_delivery_time */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET); + } +#endif + + /* <store>: skb->tstamp = tstamp */ + *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static u32 bpf_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct __sk_buff, len): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, len, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, protocol, 2, + target_size)); + break; + + case offsetof(struct __sk_buff, vlan_proto): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_proto, 2, + target_size)); + break; + + case offsetof(struct __sk_buff, priority): + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, skb_iif, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, ifindex, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, hash): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, hash, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, mark): + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); + break; + + case offsetof(struct __sk_buff, pkt_type): + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_TYPE_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif + break; + + case offsetof(struct __sk_buff, queue_mapping): + if (type == BPF_WRITE) { + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } else { + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } + break; + + case offsetof(struct __sk_buff, vlan_present): + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + + case offsetof(struct __sk_buff, vlan_tci): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_tci, 2, + target_size)); + break; + + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, tc_classid): + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); + *target_size = 2; + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + break; + + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, data_end): + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + + case offsetof(struct __sk_buff, tc_index): +#ifdef CONFIG_NET_SCHED + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); +#else + *target_size = 2; + if (type == BPF_WRITE) + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); + else + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, napi_id): +#if defined(CONFIG_NET_RX_BUSY_POLL) + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, napi_id, 4, + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#else + *target_size = 4; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, family): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_family, + 2, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_daddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, local_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_rcv_saddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip6[0]) ... + offsetof(struct __sk_buff, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, remote_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, local_ip6[0]) ... + offsetof(struct __sk_buff, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, local_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, remote_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_dport, + 2, target_size)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct __sk_buff, local_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_num, 2, target_size)); + break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + insn = bpf_convert_tstamp_write(prog, si, insn); + else + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); + break; + + case offsetof(struct __sk_buff, gso_segs): + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_segs, 2, + target_size)); + break; + case offsetof(struct __sk_buff, gso_size): + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_size, 2, + target_size)); + break; + case offsetof(struct __sk_buff, wire_len): + BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, wire_len); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, pkt_len); + *target_size = 4; + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + break; + case offsetof(struct __sk_buff, hwtstamp): + BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); + BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); + + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + hwtstamps, 8, + target_size)); + break; + } + + return insn - insn_buf; +} + +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + + case offsetof(struct bpf_sock, mark): + BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + break; + + case offsetof(struct bpf_sock, priority): + BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + break; + + case offsetof(struct bpf_sock, family): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_family), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_family, + sizeof_field(struct sock_common, + skc_family), + target_size)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_type), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_type, + sizeof_field(struct sock, sk_type), + target_size)); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_protocol), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_protocol, + sizeof_field(struct sock, sk_protocol), + target_size)); + break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + sizeof_field(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case offsetof(struct bpf_sock, dst_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_daddr, + sizeof_field(struct sock_common, + skc_daddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, dst_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_v6_daddr.s6_addr32[0], + sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]), + target_size) + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + *target_size = 4; +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + sizeof_field(struct sock_common, + skc_num), + target_size)); + break; + + case offsetof(struct bpf_sock, dst_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_dport), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_dport, + sizeof_field(struct sock_common, + skc_dport), + target_size)); + break; + + case offsetof(struct bpf_sock, state): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_state), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_state, + sizeof_field(struct sock_common, + skc_state), + target_size)); + break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; + } + + return insn - insn_buf; +} + +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct __sk_buff, ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, ifindex, 4, + target_size)); + break; + default: + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } + + return insn - insn_buf; +} + +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct xdp_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data)); + break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; + case offsetof(struct xdp_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_end)); + break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, + queue_index)); + break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + } + + return insn - insn_buf; +} + +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, sizeof_field(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override neither SRC, since it contains value to store, nor + * DST since it contains pointer to context that may be used by later + * instructions. But we need a temporary place to save pointer to nested + * structure whose field we want to store to. + */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, sizeof_field(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ + OFF, TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. + */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != + sizeof_field(struct sockaddr_in6, sin6_port)); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_type); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_protocol); + break; + + case offsetof(struct bpf_sock_addr, msg_src_ip4): + /* Treat t_ctx as struct in_addr for msg_src_ip4. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in_addr, t_ctx, + s_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], + msg_src_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); + /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct in6_addr, t_ctx, + s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); + break; + case offsetof(struct bpf_sock_addr, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_addr_kern, sk)); + break; + } + + return insn - insn_buf; +} + +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ + BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ + sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + fullsock_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ + } while (0) + +#define SOCK_OPS_GET_SK() \ + do { \ + int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + fullsock_reg = reg; \ + jmp += 2; \ + } \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + fullsock_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ + if (si->dst_reg == si->src_reg) \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + if (si->dst_reg == si->src_reg) { \ + *insn++ = BPF_JMP_A(1); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } \ + } while (0) + +#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ + SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ + sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + op), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, op)); + break; + + case offsetof(struct bpf_sock_ops, replylong[0]) ... + offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != + sizeof_field(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != + sizeof_field(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, replylong[0]); + off += offsetof(struct bpf_sock_ops_kern, replylong[0]); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... + offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, remote_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + sizeof_field(struct minmax_sample, t)); + break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); + break; + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); + break; + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); + break; + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); + break; + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); + break; + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); + break; + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); + break; + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); + break; + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); + break; + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); + break; + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); + break; + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); + break; + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); + break; + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); + break; + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); + break; + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); + break; + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); + break; + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); + break; + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); + break; + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); + break; + case offsetof(struct bpf_sock_ops, sk): + SOCK_OPS_GET_SK(); + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb_data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb_data_end)); + break; + case offsetof(struct bpf_sock_ops, skb_data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, data)); + break; + case offsetof(struct bpf_sock_ops, skb_len): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, len)); + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + off = offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, tcp_flags); + *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, + tcp_flags), + si->dst_reg, si->dst_reg, off); + break; + } + return insn - insn_buf; +} + +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + int reg; + int temp_reg_off = offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, temp_reg); + + if (si->src_reg == si->dst_reg) { + /* We need an extra register, choose and save a register. */ + reg = BPF_REG_9; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); + } else { + reg = si->dst_reg; + } + + /* reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + + /* reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); + + if (si->src_reg == si->dst_reg) { + /* Restore the saved register */ + *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, reg); + *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); + } + + return insn; +} + +static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct __sk_buff, data_end): + insn = bpf_convert_data_end_access(si, insn); + break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + + default: + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); + } + + return insn - insn_buf; +} + +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; +#if IS_ENABLED(CONFIG_IPV6) + int off; +#endif + + /* convert ctx uses the fact sg element is first in struct */ + BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, data_end)); + break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct sk_msg_md, remote_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ... + offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(sizeof_field(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + + case offsetof(struct sk_msg_md, size): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_sg, size)); + break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_filter_verifier_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, +}; + +const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops xdp_verifier_ops = { + .get_func_proto = xdp_func_proto, + .is_valid_access = xdp_is_valid_access, + .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, +}; + +const struct bpf_prog_ops xdp_prog_ops = { + .test_run = bpf_prog_test_run_xdp, +}; + +const struct bpf_verifier_ops cg_skb_verifier_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = cg_skb_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + .get_func_proto = lwt_out_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops cg_sock_verifier_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = bpf_sock_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { + .get_func_proto = sock_ops_func_proto, + .is_valid_access = sock_ops_is_valid_access, + .convert_ctx_access = sock_ops_convert_ctx_access, +}; + +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { + .get_func_proto = sk_skb_func_proto, + .is_valid_access = sk_skb_is_valid_access, + .convert_ctx_access = sk_skb_convert_ctx_access, + .gen_prologue = sk_skb_prologue, +}; + +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + +const struct bpf_verifier_ops flow_dissector_verifier_ops = { + .get_func_proto = flow_dissector_func_proto, + .is_valid_access = flow_dissector_is_valid_access, + .convert_ctx_access = flow_dissector_convert_ctx_access, +}; + +const struct bpf_prog_ops flow_dissector_prog_ops = { + .test_run = bpf_prog_test_run_flow_dissector, +}; + +int sk_detach_filter(struct sock *sk) +{ + int ret = -ENOENT; + struct sk_filter *filter; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + filter = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); + if (filter) { + RCU_INIT_POINTER(sk->sk_filter, NULL); + sk_filter_uncharge(sk, filter); + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sk_detach_filter); + +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) +{ + struct sock_fprog_kern *fprog; + struct sk_filter *filter; + int ret = 0; + + sockopt_lock_sock(sk); + filter = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); + if (!filter) + goto out; + + /* We're copying the filter that has been originally attached, + * so no conversion/decode needed anymore. eBPF programs that + * have no original program cannot be dumped through this. + */ + ret = -EACCES; + fprog = filter->prog->orig_prog; + if (!fprog) + goto out; + + ret = fprog->len; + if (!len) + /* User space only enquires number of filter blocks. */ + goto out; + + ret = -EINVAL; + if (len < fprog->len) + goto out; + + ret = -EFAULT; + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) + goto out; + + /* Instead of bytes, the API requests to return the number + * of filter blocks. + */ + ret = fprog->len; +out: + sockopt_release_sock(sk); + return ret; +} + +#ifdef CONFIG_INET +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + struct sock *migrating_sk, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->migrating_sk = migrating_sk; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + struct sock *migrating_sk, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); + action = bpf_prog_run(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + + /* reuseport_array has only sk with non NULL sk_reuseport_cb. + * The only (!reuse) case here is - the sk has already been + * unhashed (e.g. by close()), so treat it as -ENOENT. + * + * Other maps (e.g. sock_map) do not provide this guarantee and + * the sk may never be in the reuseport group to begin with. + */ + return is_sockarray ? -ENOENT : -EINVAL; + } + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk = reuse_kern->sk; + + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + case offsetof(struct sk_reuseport_md, sk): + info->reg_type = PTR_TO_SOCKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, migrating_sk): + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + return size == sizeof(__u64); + + /* Fields that allow narrowing */ + case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): + if (size < sizeof_field(struct sk_buff, protocol)) + return false; + fallthrough; + case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): + case bpf_ctx_range(struct sk_reuseport_md, bind_inany): + case bpf_ctx_range(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + sizeof_field(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + + case offsetof(struct sk_reuseport_md, sk): + SK_REUSEPORT_LOAD_FIELD(sk); + break; + + case offsetof(struct sk_reuseport_md, migrating_sk): + SK_REUSEPORT_LOAD_FIELD(migrating_sk); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; + +DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); +EXPORT_SYMBOL(bpf_sk_lookup_enabled); + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) + return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ + if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) + return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_sk_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + /* Allow 4-byte access to 2-byte field for backward compatibility */ + if (size == sizeof(__u32)) + return true; + bpf_ctx_record_field_size(info, sizeof(__be16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); + + case offsetofend(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: + /* Allow access to zero padding for backward compatibility */ + bpf_ctx_record_field_size(info, sizeof(__u16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetofend(struct bpf_sk_lookup, remote_port): + *target_size = 2; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + ingress_ifindex, 4, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + +#endif /* CONFIG_INET */ + +DEFINE_BPF_DISPATCHER(xdp) + +void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) +{ + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); +} + +BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) +#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ + /* BTF types for tcp_timewait_sock and inet_timewait_sock are not + * generated if CONFIG_INET=n. Trigger an explicit generation here. + */ + BTF_TYPE_EMIT(struct inet_timewait_sock); + BTF_TYPE_EMIT(struct tcp_timewait_sock); + +#ifdef CONFIG_INET + if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ +#ifdef CONFIG_INET + if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; + +BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) +{ + /* unix_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct unix_sock); + if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { + .func = bpf_skc_to_unix_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], +}; + +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + +static const struct bpf_func_proto * +bpf_sk_base_func_proto(enum bpf_func_id func_id) +{ + const struct bpf_func_proto *func; + + switch (func_id) { + case BPF_FUNC_skc_to_tcp6_sock: + func = &bpf_skc_to_tcp6_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_sock: + func = &bpf_skc_to_tcp_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_timewait_sock: + func = &bpf_skc_to_tcp_timewait_sock_proto; + break; + case BPF_FUNC_skc_to_tcp_request_sock: + func = &bpf_skc_to_tcp_request_sock_proto; + break; + case BPF_FUNC_skc_to_udp6_sock: + func = &bpf_skc_to_udp6_sock_proto; + break; + case BPF_FUNC_skc_to_unix_sock: + func = &bpf_skc_to_unix_sock_proto; + break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; + default: + return bpf_base_func_proto(func_id); + } + + if (!perfmon_capable()) + return NULL; + + return func; +} diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c new file mode 100644 index 000000000..0c85c8a9e --- /dev/null +++ b/net/core/flow_dissector.c @@ -0,0 +1,1960 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/export.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <linux/filter.h> +#include <net/dsa.h> +#include <net/dst_metadata.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/gre.h> +#include <net/pptp.h> +#include <net/tipc.h> +#include <linux/igmp.h> +#include <linux/icmp.h> +#include <linux/sctp.h> +#include <linux/dccp.h> +#include <linux/if_tunnel.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <linux/stddef.h> +#include <linux/if_ether.h> +#include <linux/if_hsr.h> +#include <linux/mpls.h> +#include <linux/tcp.h> +#include <linux/ptp_classify.h> +#include <net/flow_dissector.h> +#include <scsi/fc/fc_fcoe.h> +#include <uapi/linux/batadv_packet.h> +#include <linux/bpf.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_labels.h> +#endif +#include <linux/bpf-netns.h> + +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1 << key_id); +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) +{ + unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, key++) { + /* User should make sure that every key target offset is within + * boundaries of unsigned short. + */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); + + dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. + */ + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); +} +EXPORT_SYMBOL(skb_flow_dissector_init); + +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (ns == &init_net) + continue; + if (rcu_access_pointer(ns->bpf.run_array[type])) + return -EEXIST; + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.bpf.run_array[type])) + return -EEXIST; + } + + return 0; +} +#endif /* CONFIG_BPF_SYSCALL */ + +/** + * __skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen) +{ + int poff = proto_ports_offset(ip_proto); + + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + + if (poff >= 0) { + __be32 *ports, _ports; + + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); + if (ports) + return *ports; + } + + return 0; +} +EXPORT_SYMBOL(__skb_flow_get_ports); + +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @thoff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + const void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); +} + +static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_l2tpv3 *key_l2tpv3; + struct { + __be32 session_id; + } *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_l2tpv3 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_L2TPV3, + target_container); + + key_l2tpv3->session_id = hdr->session_id; +} + +void skb_flow_dissect_meta(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_meta *meta; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) + return; + + meta = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_META, + target_container); + meta->ingress_ifindex = skb->skb_iif; +} +EXPORT_SYMBOL(skb_flow_dissect_meta); + +static void +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; +} + +void +skb_flow_dissect_ct(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, u16 *ctinfo_map, + size_t mapsize, bool post_ct, u16 zone) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct flow_dissector_key_ct *key; + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) + return; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct && !post_ct) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CT, + target_container); + + if (!ct) { + key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID; + key->ct_zone = zone; + return; + } + + if (ctinfo < mapsize) + key->ct_state = ctinfo_map[ctinfo]; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) + key->ct_zone = ct->zone.id; +#endif +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + key->ct_mark = READ_ONCE(ct->mark); +#endif + + cl = nf_ct_labels_find(ct); + if (cl) + memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); +#endif /* CONFIG_NF_CONNTRACK */ +} +EXPORT_SYMBOL(skb_flow_dissect_ct); + +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + + /* A quick check to see if there might be something to do. */ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS)) + return; + + info = skb_tunnel_info(skb); + if (!info) + return; + + key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; + + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { + struct flow_dissector_key_enc_opts *enc_opt; + + enc_opt = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS, + target_container); + + if (info->options_len) { + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + enc_opt->dst_opt_type = info->key.tun_flags & + TUNNEL_OPTIONS_PRESENT; + } + } +} +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); + +void skb_flow_dissect_hash(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_hash *key; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) + return; + + key = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_HASH, + target_container); + + key->hash = skb_get_hash_raw(skb); +} +EXPORT_SYMBOL(skb_flow_dissect_hash); + +static enum flow_dissect_ret +__skb_flow_dissect_mpls(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, int nhoff, + int hlen, int lse_index, bool *entropy_label) +{ + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; + + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && + !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) + return FLOW_DISSECT_RET_OUT_GOOD; + + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + entry = ntohl(hdr->entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; + + key_mpls = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS, + target_container); + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); + } + + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = cpu_to_be32(label); + } + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; +} + +static enum flow_dissect_ret +__skb_flow_dissect_arp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_arp *key_arp; + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr _arp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) + return FLOW_DISSECT_RET_OUT_GOOD; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + return FLOW_DISSECT_RET_OUT_BAD; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + return FLOW_DISSECT_RET_OUT_BAD; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, &_arp_eth); + if (!arp_eth) + return FLOW_DISSECT_RET_OUT_BAD; + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. + */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_gre(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + __be16 *p_proto, int *p_nhoff, int *p_hlen, + unsigned int flags) +{ + struct flow_dissector_key_keyid *key_keyid; + struct gre_base_hdr *hdr, _hdr; + int offset = 0; + u16 gre_ver; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), + data, *p_hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) + return FLOW_DISSECT_RET_OUT_GOOD; + + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + return FLOW_DISSECT_RET_OUT_GOOD; + + *p_proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + return FLOW_DISSECT_RET_OUT_GOOD; + } + + offset += sizeof(struct gre_base_hdr); + + if (hdr->flags & GRE_CSUM) + offset += sizeof_field(struct gre_full_hdr, csum) + + sizeof_field(struct gre_full_hdr, reserved1); + + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_keyid), + data, *p_hlen, &_keyid); + if (!keyid) + return FLOW_DISSECT_RET_OUT_BAD; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; + } + offset += sizeof_field(struct gre_full_hdr, key); + } + + if (hdr->flags & GRE_SEQ) + offset += sizeof_field(struct pptp_gre_header, seq); + + if (gre_ver == 0) { + if (*p_proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_eth), + data, *p_hlen, &_eth); + if (!eth) + return FLOW_DISSECT_RET_OUT_BAD; + *p_proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + *p_hlen = *p_nhoff + offset; + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof_field(struct pptp_gre_header, ack); + + ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_ppp_hdr), + data, *p_hlen, _ppp_hdr); + if (!ppp_hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + *p_proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + *p_proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; + } + + *p_nhoff += offset; + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + +/** + * __skb_flow_dissect_batadv() - dissect batman-adv header + * @skb: sk_buff to with the batman-adv header + * @key_control: flow dissectors control key + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @p_proto: pointer used to update the protocol to process next + * @p_nhoff: pointer used to update inner network header offset + * @hlen: packet header length + * @flags: any combination of FLOW_DISSECTOR_F_* + * + * ETH_P_BATMAN packets are tried to be dissected. Only + * &struct batadv_unicast packets are actually processed because they contain an + * inner ethernet header and are usually followed by actual network header. This + * allows the flow dissector to continue processing the packet. + * + * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, + * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, + * otherwise FLOW_DISSECT_RET_OUT_BAD + */ +static enum flow_dissect_ret +__skb_flow_dissect_batadv(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + const void *data, __be16 *p_proto, int *p_nhoff, + int hlen, unsigned int flags) +{ + struct { + struct batadv_unicast_packet batadv_unicast; + struct ethhdr eth; + } *hdr, _hdr; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) + return FLOW_DISSECT_RET_OUT_BAD; + + *p_proto = hdr->eth.h_proto; + *p_nhoff += sizeof(*hdr); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + +static void +__skb_flow_dissect_tcp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int thoff, int hlen) +{ + struct flow_dissector_key_tcp *key_tcp; + struct tcphdr *th, _th; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) + return; + + th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); + if (!th) + return; + + if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) + return; + + key_tcp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TCP, + target_container); + key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); +} + +static void +__skb_flow_dissect_ports(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, u8 ip_proto, int hlen) +{ + enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; + struct flow_dissector_key_ports *key_ports; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + dissector_ports = FLOW_DISSECTOR_KEY_PORTS; + else if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) + dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + + if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + return; + + key_ports = skb_flow_dissector_target(flow_dissector, + dissector_ports, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); +} + +static void +__skb_flow_dissect_ipv4(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + const struct iphdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = iph->tos; + key_ip->ttl = iph->ttl; +} + +static void +__skb_flow_dissect_ipv6(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + const struct ipv6hdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = ipv6_get_dsfield(iph); + key_ip->ttl = iph->hop_limit; +} + +/* Maximum number of protocol headers that can be parsed in + * __skb_flow_dissect + */ +#define MAX_FLOW_DISSECT_HDRS 15 + +static bool skb_flow_dissect_allowed(int *num_hdrs) +{ + ++*num_hdrs; + + return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); +} + +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_ports *key_ports = NULL; + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_tags *key_tags; + + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + key_control->thoff = flow_keys->thoff; + if (flow_keys->is_frag) + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + if (flow_keys->is_first_frag) + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flow_keys->is_encap) + key_control->flags |= FLOW_DIS_ENCAPSULATION; + + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = flow_keys->n_proto; + key_basic->ip_proto = flow_keys->ip_proto; + + if (flow_keys->addr_proto == ETH_P_IP && + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + key_addrs->v4addrs.src = flow_keys->ipv4_src; + key_addrs->v4addrs.dst = flow_keys->ipv4_dst; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else if (flow_keys->addr_proto == ETH_P_IPV6 && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, + sizeof(key_addrs->v6addrs.dst)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + else if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (key_ports) { + key_ports->src = flow_keys->sport; + key_ports->dst = flow_keys->dport; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_keys->flow_label); + } +} + +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) +{ + struct bpf_flow_keys *flow_keys = ctx->flow_keys; + u32 result; + + /* Pass parameters to the BPF program */ + memset(flow_keys, 0, sizeof(*flow_keys)); + flow_keys->n_proto = proto; + flow_keys->nhoff = nhoff; + flow_keys->thoff = flow_keys->nhoff; + + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != + (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != + (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != + (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); + flow_keys->flags = flags; + + result = bpf_prog_run_pin_on_cpu(prog, ctx); + + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); + flow_keys->thoff = clamp_t(u16, flow_keys->thoff, + flow_keys->nhoff, hlen); + + return result; +} + +static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) +{ + return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; +} + +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @net: associated network namespace, derived from @skb if NULL + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * @flags: flags that control the dissection process, e.g. + * FLOW_DISSECTOR_F_STOP_AT_ENCAP. + * + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. + */ +bool __skb_flow_dissect(const struct net *net, + const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags) +{ + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; + enum flow_dissect_ret fdret; + enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; + int num_hdrs = 0; + u8 ip_proto = 0; + bool ret; + + if (!data) { + data = skb->data; + proto = skb_vlan_tag_present(skb) ? + skb->vlan_proto : skb->protocol; + nhoff = skb_network_offset(skb); + hlen = skb_headlen(skb); +#if IS_ENABLED(CONFIG_NET_DSA) + if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && + proto == htons(ETH_P_XDSA))) { + const struct dsa_device_ops *ops; + int offset = 0; + + ops = skb->dev->dsa_ptr->tag_ops; + /* Only DSA header taggers break flow dissection */ + if (ops->needed_headroom) { + if (ops->flow_dissect) + ops->flow_dissect(skb, &proto, &offset); + else + dsa_tag_generic_flow_dissect(skb, + &proto, + &offset); + hlen -= offset; + nhoff += offset; + } + } +#endif + } + + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + + if (skb) { + if (!net) { + if (skb->dev) + net = dev_net(skb->dev); + else if (skb->sk) + net = sock_net(skb->sk); + } + } + + WARN_ON_ONCE(!net); + if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + struct bpf_prog_array *run_array; + + rcu_read_lock(); + run_array = rcu_dereference(init_net.bpf.run_array[type]); + if (!run_array) + run_array = rcu_dereference(net->bpf.run_array[type]); + + if (run_array) { + struct bpf_flow_keys flow_keys; + struct bpf_flow_dissector ctx = { + .flow_keys = &flow_keys, + .data = data, + .data_end = data + hlen, + }; + __be16 n_proto = proto; + struct bpf_prog *prog; + u32 result; + + if (skb) { + ctx.skb = skb; + /* we can't use 'proto' in the skb case + * because it might be set to skb->vlan_proto + * which has been pulled from the data + */ + n_proto = skb->protocol; + } + + prog = READ_ONCE(run_array->items[0].prog); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } +dissect_continue: + rcu_read_unlock(); + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + struct flow_dissector_key_num_of_vlans *key_num_of_vlans; + + key_num_of_vlans = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_num_of_vlans->num_of_vlans = 0; + } + +proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + + switch (proto) { + case htons(ETH_P_IP): { + const struct iphdr *iph; + struct iphdr _iph; + + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); + if (!iph || iph->ihl < 5) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += iph->ihl * 4; + + ip_proto = iph->protocol; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } + + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + + if (ip_is_fragment(iph)) { + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + if (iph->frag_off & htons(IP_OFFSET)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } else { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (!(flags & + FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + } + } + + break; + } + case htons(ETH_P_IPV6): { + const struct ipv6hdr *iph; + struct ipv6hdr _iph; + + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); + if (!iph) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + ip_proto = iph->nexthdr; + nhoff += sizeof(struct ipv6hdr); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); + } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + } + + __skb_flow_dissect_ipv6(skb, flow_dissector, + target_container, data, iph); + + break; + } + case htons(ETH_P_8021AD): + case htons(ETH_P_8021Q): { + const struct vlan_hdr *vlan = NULL; + struct vlan_hdr _vlan; + __be16 saved_vlan_tpid = proto; + + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && + skb && skb_vlan_tag_present(skb)) { + proto = skb->protocol; + } else { + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), + data, hlen, &_vlan); + if (!vlan) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && + !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { + struct flow_dissector_key_num_of_vlans *key_nvs; + + key_nvs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_nvs->num_of_vlans++; + } + + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { + dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; + } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { + dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; + } else { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + if (dissector_uses_key(flow_dissector, dissector_vlan)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + dissector_vlan, + target_container); + + if (!vlan) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & + VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } + key_vlan->vlan_tpid = saved_vlan_tpid; + key_vlan->vlan_eth_type = proto; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + case htons(ETH_P_PPP_SES): { + struct { + struct pppoe_hdr hdr; + __be16 proto; + } *hdr, _hdr; + u16 ppp_proto; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* least significant bit of the most significant octet + * indicates if protocol field was compressed + */ + ppp_proto = ntohs(hdr->proto); + if (ppp_proto & 0x0100) { + ppp_proto = ppp_proto >> 8; + nhoff += PPPOE_SES_HLEN - 1; + } else { + nhoff += PPPOE_SES_HLEN; + } + + if (ppp_proto == PPP_IP) { + proto = htons(ETH_P_IP); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_IPV6) { + proto = htons(ETH_P_IPV6); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_MPLS_UC) { + proto = htons(ETH_P_MPLS_UC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_MPLS_MC) { + proto = htons(ETH_P_MPLS_MC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto_is_valid(ppp_proto)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + } else { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PPPOE)) { + struct flow_dissector_key_pppoe *key_pppoe; + + key_pppoe = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PPPOE, + target_container); + key_pppoe->session_id = hdr->hdr.sid; + key_pppoe->ppp_proto = htons(ppp_proto); + key_pppoe->type = htons(ETH_P_PPP_SES); + } + break; + } + case htons(ETH_P_TIPC): { + struct tipc_basic_hdr *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), + data, hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC, + target_container); + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; + } + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): + fdret = __skb_flow_dissect_mpls(skb, flow_dissector, + target_container, data, + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; + break; + case htons(ETH_P_FCOE): + if ((hlen - nhoff) < FCOE_HEADER_LEN) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += FCOE_HEADER_LEN; + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + fdret = __skb_flow_dissect_arp(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + + case htons(ETH_P_BATMAN): + fdret = __skb_flow_dissect_batadv(skb, key_control, data, + &proto, &nhoff, hlen, flags); + break; + + case htons(ETH_P_1588): { + struct ptp_header *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += sizeof(struct ptp_header); + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + case htons(ETH_P_PRP): + case htons(ETH_P_HSR): { + struct hsr_tag *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->encap_proto; + nhoff += HSR_HLEN; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + default: + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* Process result of proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_PROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto proto_again; + goto out_good; + case FLOW_DISSECT_RET_CONTINUE: + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + break; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; + } + +ip_proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + + switch (ip_proto) { + case IPPROTO_GRE: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, + target_container, data, + &proto, &nhoff, &hlen, flags); + break; + + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, &_opthdr); + if (!opthdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; + + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; + } + case NEXTHDR_FRAGMENT: { + struct frag_hdr _fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; + } + } + + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + case IPPROTO_IPIP: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + proto = htons(ETH_P_IP); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + case IPPROTO_IPV6: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + proto = htons(ETH_P_IPV6); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + case IPPROTO_TCP: + __skb_flow_dissect_tcp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_L2TP: + __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + + default: + break; + } + + if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) + __skb_flow_dissect_ports(skb, flow_dissector, target_container, + data, nhoff, ip_proto, hlen); + + /* Process result of IP proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_PROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto proto_again; + break; + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto ip_proto_again; + break; + case FLOW_DISSECT_RET_OUT_GOOD: + case FLOW_DISSECT_RET_CONTINUE: + break; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; + } + +out_good: + ret = true; + +out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + + return ret; + +out_bad: + ret = false; + goto out; +} +EXPORT_SYMBOL(__skb_flow_dissect); + +static siphash_aligned_key_t hashrnd; +static __always_inline void __flow_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static const void *flow_keys_hash_start(const struct flow_keys *flow) +{ + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); + return &flow->FLOW_KEYS_HASH_START_FIELD; +} + +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) +{ + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + case FLOW_DISSECTOR_KEY_TIPC: + diff -= sizeof(flow->addrs.tipckey); + break; + } + return sizeof(*flow) - diff; +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + case FLOW_DISSECTOR_KEY_TIPC: + return flow->addrs.tipckey.key; + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_dst); + +/* Sort the source and destination IP and the ports, + * to have consistent hash within the two directions + */ +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + if ((__force u32)keys->addrs.v4addrs.dst < + (__force u32)keys->addrs.v4addrs.src) + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if (addr_diff < 0) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + } + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { + swap(keys->ports.src, keys->ports.dst); + } + break; + } +} + +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, + const siphash_key_t *keyval) +{ + u32 hash; + + __flow_hash_consistentify(keys); + + hash = siphash(flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); + if (!hash) + hash = 1; + + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, &hashrnd); +} +EXPORT_SYMBOL(flow_hash_from_keys); + +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, + const siphash_key_t *keyval) +{ + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + return __flow_hash_from_keys(keys, keyval); +} + +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + +static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; + +u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +{ + struct flow_keys keys; + + __flow_hash_secret_init(); + + memset(&keys, 0, sizeof(keys)); + __skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, + &keys, NULL, 0, 0, 0, 0); + + return __flow_hash_from_keys(&keys, &hashrnd); +} +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); + +/** + * __skb_get_hash: calculate a flow hash + * @skb: sk_buff to calculate flow hash from + * + * This function calculates a flow hash based on src/dst addresses + * and src/dst port numbers. Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_hash in skb + * if hash is a canonical 4-tuple hash over transport ports. + */ +void __skb_get_hash(struct sk_buff *skb) +{ + struct flow_keys keys; + u32 hash; + + __flow_hash_secret_init(); + + hash = ___skb_get_hash(skb, &keys, &hashrnd); + + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); +} +EXPORT_SYMBOL(__skb_get_hash); + +__u32 skb_get_hash_perturb(const struct sk_buff *skb, + const siphash_key_t *perturb) +{ + struct flow_keys keys; + + return ___skb_get_hash(skb, &keys, perturb); +} +EXPORT_SYMBOL(skb_get_hash_perturb); + +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, + const struct flow_keys_basic *keys, int hlen) +{ + u32 poff = keys->control.thoff; + + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + + switch (keys->basic.ip_proto) { + case IPPROTO_TCP: { + /* access doff as u8 to avoid unaligned access */ + const u8 *doff; + u8 _doff; + + doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), + data, hlen, &_doff); + if (!doff) + return poff; + + poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + poff += sizeof(struct udphdr); + break; + /* For the rest, we do not really care about header + * extensions at this point for now. + */ + case IPPROTO_ICMP: + poff += sizeof(struct icmphdr); + break; + case IPPROTO_ICMPV6: + poff += sizeof(struct icmp6hdr); + break; + case IPPROTO_IGMP: + poff += sizeof(struct igmphdr); + break; + case IPPROTO_DCCP: + poff += sizeof(struct dccp_hdr); + break; + case IPPROTO_SCTP: + poff += sizeof(struct sctphdr); + break; + } + + return poff; +} + +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys_basic keys; + + if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, + NULL, 0, 0, 0, 0)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} + +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +}; + +static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, +}; + +static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct flow_dissector flow_keys_basic_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_basic_dissector); + +static int __init init_default_flow_dissectors(void) +{ + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_dissector_symmetric, + flow_keys_dissector_symmetric_keys, + ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); + skb_flow_dissector_init(&flow_keys_basic_dissector, + flow_keys_basic_dissector_keys, + ARRAY_SIZE(flow_keys_basic_dissector_keys)); + return 0; +} +core_initcall(init_default_flow_dissectors); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c new file mode 100644 index 000000000..abe423fd5 --- /dev/null +++ b/net/core/flow_offload.c @@ -0,0 +1,624 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <net/act_api.h> +#include <net/flow_offload.h> +#include <linux/rtnetlink.h> +#include <linux/mutex.h> +#include <linux/rhashtable.h> + +struct flow_rule *flow_rule_alloc(unsigned int num_actions) +{ + struct flow_rule *rule; + int i; + + rule = kzalloc(struct_size(rule, action.entries, num_actions), + GFP_KERNEL); + if (!rule) + return NULL; + + rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return rule; +} +EXPORT_SYMBOL(flow_rule_alloc); + +struct flow_offload_action *offload_action_alloc(unsigned int num_actions) +{ + struct flow_offload_action *fl_action; + int i; + + fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), + GFP_KERNEL); + if (!fl_action) + return NULL; + + fl_action->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return fl_action; +} + +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ + const struct flow_match *__m = &(__rule)->match; \ + struct flow_dissector *__d = (__m)->dissector; \ + \ + (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ + (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ + +void flow_rule_match_meta(const struct flow_rule *rule, + struct flow_match_meta *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out); +} +EXPORT_SYMBOL(flow_rule_match_meta); + +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} +EXPORT_SYMBOL(flow_rule_match_basic); + +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_control); + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_eth_addrs); + +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_vlan); + +void flow_rule_match_cvlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_cvlan); + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv4_addrs); + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv6_addrs); + +void flow_rule_match_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_ip); + +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_ports); + +void flow_rule_match_ports_range(const struct flow_rule *rule, + struct flow_match_ports_range *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out); +} +EXPORT_SYMBOL(flow_rule_match_ports_range); + +void flow_rule_match_tcp(const struct flow_rule *rule, + struct flow_match_tcp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out); +} +EXPORT_SYMBOL(flow_rule_match_tcp); + +void flow_rule_match_icmp(const struct flow_rule *rule, + struct flow_match_icmp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out); +} +EXPORT_SYMBOL(flow_rule_match_icmp); + +void flow_rule_match_mpls(const struct flow_rule *rule, + struct flow_match_mpls *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out); +} +EXPORT_SYMBOL(flow_rule_match_mpls); + +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_control); + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs); + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs); + +void flow_rule_match_enc_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ip); + +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ports); + +void flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_keyid); + +void flow_rule_match_enc_opts(const struct flow_rule *rule, + struct flow_match_enc_opts *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_opts); + +struct flow_action_cookie *flow_action_cookie_create(void *data, + unsigned int len, + gfp_t gfp) +{ + struct flow_action_cookie *cookie; + + cookie = kmalloc(sizeof(*cookie) + len, gfp); + if (!cookie) + return NULL; + cookie->cookie_len = len; + memcpy(cookie->cookie, data, len); + return cookie; +} +EXPORT_SYMBOL(flow_action_cookie_create); + +void flow_action_cookie_destroy(struct flow_action_cookie *cookie) +{ + kfree(cookie); +} +EXPORT_SYMBOL(flow_action_cookie_destroy); + +void flow_rule_match_ct(const struct flow_rule *rule, + struct flow_match_ct *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out); +} +EXPORT_SYMBOL(flow_rule_match_ct); + +void flow_rule_match_pppoe(const struct flow_rule *rule, + struct flow_match_pppoe *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out); +} +EXPORT_SYMBOL(flow_rule_match_pppoe); + +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out); +} +EXPORT_SYMBOL(flow_rule_match_l2tpv3); + +struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv)) +{ + struct flow_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return ERR_PTR(-ENOMEM); + + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + block_cb->release = release; + + return block_cb; +} +EXPORT_SYMBOL(flow_block_cb_alloc); + +void flow_block_cb_free(struct flow_block_cb *block_cb) +{ + if (block_cb->release) + block_cb->release(block_cb->cb_priv); + + kfree(block_cb); +} +EXPORT_SYMBOL(flow_block_cb_free); + +struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, + flow_setup_cb_t *cb, void *cb_ident) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &block->cb_list, list) { + if (block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return block_cb; + } + + return NULL; +} +EXPORT_SYMBOL(flow_block_cb_lookup); + +void *flow_block_cb_priv(struct flow_block_cb *block_cb) +{ + return block_cb->cb_priv; +} +EXPORT_SYMBOL(flow_block_cb_priv); + +void flow_block_cb_incref(struct flow_block_cb *block_cb) +{ + block_cb->refcnt++; +} +EXPORT_SYMBOL(flow_block_cb_incref); + +unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb) +{ + return --block_cb->refcnt; +} +EXPORT_SYMBOL(flow_block_cb_decref); + +bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, + struct list_head *driver_block_list) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, driver_block_list, driver_list) { + if (block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return true; + } + + return false; +} +EXPORT_SYMBOL(flow_block_cb_is_busy); + +int flow_block_cb_setup_simple(struct flow_block_offload *f, + struct list_head *driver_block_list, + flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + bool ingress_only) +{ + struct flow_block_cb *block_cb; + + if (ingress_only && + f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + return -EOPNOTSUPP; + + f->driver_block_list = driver_block_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list)) + return -EBUSY; + + block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL); + if (IS_ERR(block_cb)) + return PTR_ERR(block_cb); + + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, driver_block_list); + return 0; + case FLOW_BLOCK_UNBIND: + block_cb = flow_block_cb_lookup(f->block, cb, cb_ident); + if (!block_cb) + return -ENOENT; + + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + return 0; + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL(flow_block_cb_setup_simple); + +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); +static LIST_HEAD(flow_indir_dev_list); + +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; +}; + +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); + + return indr_dev; +} + +struct flow_indir_dev_info { + void *data; + struct net_device *dev; + struct Qdisc *sch; + enum tc_setup_type type; + void (*cleanup)(struct flow_block_cb *block_cb); + struct list_head list; + enum flow_block_command command; + enum flow_block_binder_type binder_type; + struct list_head *cb_list; +}; + +static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_block_offload bo; + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + memset(&bo, 0, sizeof(bo)); + bo.command = cur->command; + bo.binder_type = cur->binder_type; + INIT_LIST_HEAD(&bo.cb_list); + cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup); + list_splice(&bo.cb_list, cur->cb_list); + } +} + +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } + + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } + + list_add(&indr_dev->list, &flow_block_indr_dev_list); + existing_qdiscs_register(cb, cb_priv); + mutex_unlock(&flow_indr_block_lock); + + tcf_action_reoffload_cb(cb, cb_priv, true); + + return 0; +} +EXPORT_SYMBOL(flow_indr_dev_register); + +static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), + void *cb_priv, + struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->release == release && + this->indr.cb_priv == cb_priv) + list_move(&this->indr.list, cleanup_list); + } +} + +static void flow_block_indr_notify(struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } +} + +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + void (*release)(void *cb_priv)) +{ + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } + + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return; + } + + __flow_block_indr_cleanup(release, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); + + tcf_action_reoffload_cb(cb, cb_priv, false); + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); +} +EXPORT_SYMBOL(flow_indr_dev_unregister); + +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, struct Qdisc *sch, void *data, + void *cb_priv, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.cb_priv = cb_priv; + flow_block->indr.dev = dev; + flow_block->indr.sch = sch; + flow_block->indr.cleanup = cleanup; +} + +struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv), + struct flow_block_offload *bo, + struct net_device *dev, + struct Qdisc *sch, void *data, + void *indr_cb_priv, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release); + if (IS_ERR(block_cb)) + goto out; + + flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + +out: + return block_cb; +} +EXPORT_SYMBOL(flow_indr_block_cb_alloc); + +static struct flow_indir_dev_info *find_indir_dev(void *data) +{ + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + if (cur->data == data) + return cur; + } + return NULL; +} + +static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb), + struct flow_block_offload *bo) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (info) + return -EEXIST; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->data = data; + info->dev = dev; + info->sch = sch; + info->type = type; + info->cleanup = cleanup; + info->command = bo->command; + info->binder_type = bo->binder_type; + info->cb_list = bo->cb_list_head; + + list_add(&info->list, &flow_indir_dev_list); + return 0; +} + +static int indir_dev_remove(void *data) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (!info) + return -ENOENT; + + list_del(&info->list); + + kfree(info); + return 0; +} + +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_indr_dev *this; + u32 count = 0; + int err; + + mutex_lock(&flow_indr_block_lock); + if (bo) { + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + } + + list_for_each_entry(this, &flow_block_indr_dev_list, list) { + err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + if (!err) + count++; + } + + mutex_unlock(&flow_indr_block_lock); + + return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; +} +EXPORT_SYMBOL(flow_indr_dev_setup_offload); + +bool flow_indr_dev_exists(void) +{ + return !list_empty(&flow_block_indr_dev_list); +} +EXPORT_SYMBOL(flow_indr_dev_exists); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c new file mode 100644 index 000000000..4fcbdd71c --- /dev/null +++ b/net/core/gen_estimator.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/sched/gen_estimator.c Simple rate estimator. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Eric Dumazet <edumazet@google.com> + * + * Changes: + * Jamal Hadi Salim - moved it to net/core and reshulfed + * names to make it usable in general net subsystem. + */ + +#include <linux/uaccess.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/seqlock.h> +#include <net/sock.h> +#include <net/gen_stats.h> + +/* This code is NOT intended to be used for statistics collection, + * its purpose is to provide a base for statistical multiplexing + * for controlled load service. + * If you need only statistics, run a user level daemon which + * periodically reads byte counters. + */ + +struct net_rate_estimator { + struct gnet_stats_basic_sync *bstats; + spinlock_t *stats_lock; + bool running; + struct gnet_stats_basic_sync __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ + + seqcount_t seq; + u64 last_packets; + u64 last_bytes; + + u64 avpps; + u64 avbps; + + unsigned long next_jiffies; + struct timer_list timer; + struct rcu_head rcu; +}; + +static void est_fetch_counters(struct net_rate_estimator *e, + struct gnet_stats_basic_sync *b) +{ + gnet_stats_basic_sync_init(b); + if (e->stats_lock) + spin_lock(e->stats_lock); + + gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); + + if (e->stats_lock) + spin_unlock(e->stats_lock); + +} + +static void est_timer(struct timer_list *t) +{ + struct net_rate_estimator *est = from_timer(est, t, timer); + struct gnet_stats_basic_sync b; + u64 b_bytes, b_packets; + u64 rate, brate; + + est_fetch_counters(est, &b); + b_bytes = u64_stats_read(&b.bytes); + b_packets = u64_stats_read(&b.packets); + + brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); + + rate = (b_packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); + + write_seqcount_begin(&est->seq); + est->avbps += brate; + est->avpps += rate; + write_seqcount_end(&est->seq); + + est->last_bytes = b_bytes; + est->last_packets = b_packets; + + est->next_jiffies += ((HZ/4) << est->intvl_log); + + if (unlikely(time_after_eq(jiffies, est->next_jiffies))) { + /* Ouch... timer was delayed. */ + est->next_jiffies = jiffies + 1; + } + mod_timer(&est->timer, est->next_jiffies); +} + +/** + * gen_new_estimator - create a new rate estimator + * @bstats: basic statistics + * @cpu_bstats: bstats per cpu + * @rate_est: rate estimator statistics + * @lock: lock for statistics and control path + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @bstats_cpu is NULL + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est + * as destination. A new timer with the interval specified in the + * configuration TLV is created. Upon each interval, the latest statistics + * will be read from &bstats and the estimated rate will be stored in + * &rate_est with the statistics lock grabbed during this period. + * + * Returns 0 on success or a negative error code. + * + */ +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, + bool running, + struct nlattr *opt) +{ + struct gnet_estimator *parm = nla_data(opt); + struct net_rate_estimator *old, *est; + struct gnet_stats_basic_sync b; + int intvl_log; + + if (nla_len(opt) < sizeof(*parm)) + return -EINVAL; + + /* allowed timer periods are : + * -2 : 250ms, -1 : 500ms, 0 : 1 sec + * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec + */ + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + + est = kzalloc(sizeof(*est), GFP_KERNEL); + if (!est) + return -ENOBUFS; + + seqcount_init(&est->seq); + intvl_log = parm->interval + 2; + est->bstats = bstats; + est->stats_lock = lock; + est->running = running; + est->ewma_log = parm->ewma_log; + est->intvl_log = intvl_log; + est->cpu_bstats = cpu_bstats; + + if (lock) + local_bh_disable(); + est_fetch_counters(est, &b); + if (lock) + local_bh_enable(); + est->last_bytes = u64_stats_read(&b.bytes); + est->last_packets = u64_stats_read(&b.packets); + + if (lock) + spin_lock_bh(lock); + old = rcu_dereference_protected(*rate_est, 1); + if (old) { + del_timer_sync(&old->timer); + est->avbps = old->avbps; + est->avpps = old->avpps; + } + + est->next_jiffies = jiffies + ((HZ/4) << intvl_log); + timer_setup(&est->timer, est_timer, 0); + mod_timer(&est->timer, est->next_jiffies); + + rcu_assign_pointer(*rate_est, est); + if (lock) + spin_unlock_bh(lock); + if (old) + kfree_rcu(old, rcu); + return 0; +} +EXPORT_SYMBOL(gen_new_estimator); + +/** + * gen_kill_estimator - remove a rate estimator + * @rate_est: rate estimator + * + * Removes the rate estimator. + * + */ +void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) +{ + struct net_rate_estimator *est; + + est = xchg((__force struct net_rate_estimator **)rate_est, NULL); + if (est) { + del_timer_sync(&est->timer); + kfree_rcu(est, rcu); + } +} +EXPORT_SYMBOL(gen_kill_estimator); + +/** + * gen_replace_estimator - replace rate estimator configuration + * @bstats: basic statistics + * @cpu_bstats: bstats per cpu + * @rate_est: rate estimator statistics + * @lock: lock for statistics and control path + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling + * gen_kill_estimator() and gen_new_estimator(). + * + * Returns 0 on success or a negative error code. + */ +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, + bool running, struct nlattr *opt) +{ + return gen_new_estimator(bstats, cpu_bstats, rate_est, + lock, running, opt); +} +EXPORT_SYMBOL(gen_replace_estimator); + +/** + * gen_estimator_active - test if estimator is currently in use + * @rate_est: rate estimator + * + * Returns true if estimator is active, and false if not. + */ +bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est) +{ + return !!rcu_access_pointer(*rate_est); +} +EXPORT_SYMBOL(gen_estimator_active); + +bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est, + struct gnet_stats_rate_est64 *sample) +{ + struct net_rate_estimator *est; + unsigned seq; + + rcu_read_lock(); + est = rcu_dereference(*rate_est); + if (!est) { + rcu_read_unlock(); + return false; + } + + do { + seq = read_seqcount_begin(&est->seq); + sample->bps = est->avbps >> 8; + sample->pps = est->avpps >> 8; + } while (read_seqcount_retry(&est->seq, seq)); + + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(gen_estimator_read); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c new file mode 100644 index 000000000..c8d137ef5 --- /dev/null +++ b/net/core/gen_stats.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/gen_stats.c + * + * Authors: Thomas Graf <tgraf@suug.ch> + * Jamal Hadi Salim + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * See Documentation/networking/gen_stats.rst + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/socket.h> +#include <linux/rtnetlink.h> +#include <linux/gen_stats.h> +#include <net/netlink.h> +#include <net/gen_stats.h> +#include <net/sch_generic.h> + +static inline int +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) +{ + if (nla_put_64bit(d->skb, type, size, buf, padattr)) + goto nla_put_failure; + return 0; + +nla_put_failure: + if (d->lock) + spin_unlock_bh(d->lock); + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; + return -1; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV + * @xstats_type: TLV type for backward compatibility xstats TLV + * @lock: statistics lock + * @d: dumping handle + * @padattr: padding attribute + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * The dumping handle is marked to be in backward compatibility mode telling + * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, + int xstats_type, spinlock_t *lock, + struct gnet_dump *d, int padattr) + __acquires(lock) +{ + memset(d, 0, sizeof(*d)); + + if (type) + d->tail = (struct nlattr *)skb_tail_pointer(skb); + d->skb = skb; + d->compat_tc_stats = tc_stats_type; + d->compat_xstats = xstats_type; + d->padattr = padattr; + if (lock) { + d->lock = lock; + spin_lock_bh(lock); + } + if (d->tail) { + int ret = gnet_stats_copy(d, type, NULL, 0, padattr); + + /* The initial attribute added in gnet_stats_copy() may be + * preceded by a padding attribute, in which case d->tail will + * end up pointing at the padding instead of the real attribute. + * Fix this so gnet_stats_finish_copy() adjusts the length of + * the right attribute. + */ + if (ret == 0 && d->tail->nla_type == padattr) + d->tail = (struct nlattr *)((char *)d->tail + + NLA_ALIGN(d->tail->nla_len)); + return ret; + } + + return 0; +} +EXPORT_SYMBOL(gnet_stats_start_copy_compat); + +/** + * gnet_stats_start_copy - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @lock: statistics lock + * @d: dumping handle + * @padattr: padding attribute + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d, int padattr) +{ + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr); +} +EXPORT_SYMBOL(gnet_stats_start_copy); + +/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) +{ + u64_stats_set(&b->bytes, 0); + u64_stats_set(&b->packets, 0); + u64_stats_init(&b->syncp); +} +EXPORT_SYMBOL(gnet_stats_basic_sync_init); + +static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu) +{ + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + _bstats_update(bstats, t_bytes, t_packets); +} + +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + u64 bytes = 0; + u64 packets = 0; + + WARN_ON_ONCE((cpu || running) && in_hardirq()); + + if (cpu) { + gnet_stats_add_basic_cpu(bstats, cpu); + return; + } + do { + if (running) + start = u64_stats_fetch_begin_irq(&b->syncp); + bytes = u64_stats_read(&b->bytes); + packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + + _bstats_update(bstats, bytes, packets); +} +EXPORT_SYMBOL(gnet_stats_add_basic); + +static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + + if (cpu) { + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + *ret_bytes = t_bytes; + *ret_packets = t_packets; + return; + } + do { + if (running) + start = u64_stats_fetch_begin_irq(&b->syncp); + *ret_bytes = u64_stats_read(&b->bytes); + *ret_packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); +} + +static int +___gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + int type, bool running) +{ + u64 bstats_bytes, bstats_packets; + + gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); + + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { + d->tc_stats.bytes = bstats_bytes; + d->tc_stats.packets = bstats_packets; + } + + if (d->tail) { + struct gnet_stats_basic sb; + int res; + + memset(&sb, 0, sizeof(sb)); + sb.bytes = bstats_bytes; + sb.packets = bstats_packets; + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats_packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, + sizeof(bstats_packets), TCA_STATS_PAD); + } + return 0; +} + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) +{ + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); +} +EXPORT_SYMBOL(gnet_stats_copy_basic); + +/** + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) +{ + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); +} +EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @rate_est: rate estimator + * + * Appends the rate estimator statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_rate_est(struct gnet_dump *d, + struct net_rate_estimator __rcu **rate_est) +{ + struct gnet_stats_rate_est64 sample; + struct gnet_stats_rate_est est; + int res; + + if (!gen_estimator_read(rate_est, &sample)) + return 0; + est.bps = min_t(u64, UINT_MAX, sample.bps); + /* we have some time before reaching 2^32 packets per second */ + est.pps = sample.pps; + + if (d->compat_tc_stats) { + d->tc_stats.bps = est.bps; + d->tc_stats.pps = est.pps; + } + + if (d->tail) { + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), + TCA_STATS_PAD); + if (res < 0 || est.bps == sample.bps) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample, + sizeof(sample), TCA_STATS_PAD); + } + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_rate_est); + +static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) +{ + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + + qstats->qlen += qcpu->qlen; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; + qstats->overlimits += qcpu->overlimits; + } +} + +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q) +{ + if (cpu) { + gnet_stats_add_queue_cpu(qstats, cpu); + } else { + qstats->qlen += q->qlen; + qstats->backlog += q->backlog; + qstats->drops += q->drops; + qstats->requeues += q->requeues; + qstats->overlimits += q->overlimits; + } +} +EXPORT_SYMBOL(gnet_stats_add_queue); + +/** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV + * @d: dumping handle + * @cpu_q: per cpu queue statistics + * @q: queue statistics + * @qlen: queue length statistics + * + * Appends the queue statistics to the top level TLV created by + * gnet_stats_start_copy(). Using per cpu queue statistics if + * they are available. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen) +{ + struct gnet_stats_queue qstats = {0}; + + gnet_stats_add_queue(&qstats, cpu_q, q); + qstats.qlen = qlen; + + if (d->compat_tc_stats) { + d->tc_stats.drops = qstats.drops; + d->tc_stats.qlen = qstats.qlen; + d->tc_stats.backlog = qstats.backlog; + d->tc_stats.overlimits = qstats.overlimits; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_QUEUE, + &qstats, sizeof(qstats), + TCA_STATS_PAD); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_queue); + +/** + * gnet_stats_copy_app - copy application specific statistics into statistics TLV + * @d: dumping handle + * @st: application specific statistics data + * @len: length of data + * + * Appends the application specific statistics to the top level TLV created by + * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping + * handle is in backward compatibility mode. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) +{ + if (d->compat_xstats) { + d->xstats = kmemdup(st, len, GFP_ATOMIC); + if (!d->xstats) + goto err_out; + d->xstats_len = len; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_APP, st, len, + TCA_STATS_PAD); + + return 0; + +err_out: + if (d->lock) + spin_unlock_bh(d->lock); + d->xstats_len = 0; + return -1; +} +EXPORT_SYMBOL(gnet_stats_copy_app); + +/** + * gnet_stats_finish_copy - finish dumping procedure + * @d: dumping handle + * + * Corrects the length of the top level TLV to include all TLVs added + * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs + * if gnet_stats_start_copy_compat() was used and releases the statistics + * lock. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_finish_copy(struct gnet_dump *d) +{ + if (d->tail) + d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; + + if (d->compat_tc_stats) + if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, + sizeof(d->tc_stats), d->padattr) < 0) + return -1; + + if (d->compat_xstats && d->xstats) { + if (gnet_stats_copy(d, d->compat_xstats, d->xstats, + d->xstats_len, d->padattr) < 0) + return -1; + } + + if (d->lock) + spin_unlock_bh(d->lock); + kfree(d->xstats); + d->xstats = NULL; + d->xstats_len = 0; + return 0; +} +EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/gro.c b/net/core/gro.c new file mode 100644 index 000000000..352f966cb --- /dev/null +++ b/net/core/gro.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <net/gro.h> +#include <net/dst_metadata.h> +#include <net/busy_poll.h> +#include <trace/events/net.h> + +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +static DEFINE_SPINLOCK(offload_lock); +static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ +int gro_normal_batch __read_mostly = 8; + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct packet_offload *elem; + + spin_lock(&offload_lock); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); + +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); + +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) +{ + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + unsigned int len = skb_gro_len(skb); + unsigned int delta_truesize; + unsigned int gro_max_size; + unsigned int new_truesize; + struct sk_buff *lp; + int segs; + + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ + gro_max_size = READ_ONCE(p->dev->gro_max_size); + + if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) + return -E2BIG; + + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + + segs = NAPI_GRO_CB(skb)->count; + lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + if (nr_frags > MAX_SKB_FRAGS) + goto merge; + + offset -= headlen; + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + skb_frag_off_add(frag, offset); + skb_frag_size_sub(frag, offset); + + /* all fragments truesize : remove (head size + sk_buff) */ + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; + + skb->truesize = new_truesize; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; + goto done; + } else if (skb->head_frag) { + int nr_frags = pinfo->nr_frags; + skb_frag_t *frag = pinfo->frags + nr_frags; + struct page *page = virt_to_head_page(skb->head); + unsigned int first_size = headlen - offset; + unsigned int first_offset; + + if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) + goto merge; + + first_offset = skb->data - + (unsigned char *)page_address(page) + + offset; + + pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; + + __skb_frag_set_page(frag, page); + skb_frag_off_set(frag, first_offset); + skb_frag_size_set(frag, first_size); + + memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); + /* We dont need to clear skbinfo->nr_frags here */ + + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; + goto done; + } + +merge: + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + delta_truesize = skb->truesize; + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skb_frag_off_add(&skbinfo->frags[0], eat); + skb_frag_size_sub(&skbinfo->frags[0], eat); + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + NAPI_GRO_CB(p)->last = skb; + __skb_header_release(skb); + lp = p; + +done: + NAPI_GRO_CB(p)->count += segs; + p->data_len += len; + p->truesize += delta_truesize; + p->len += len; + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} + + +static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +{ + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct list_head *head = &offload_base; + int err = -ENOENT; + + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return; + } + +out: + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); +} + +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, + bool flush_old) +{ + struct list_head *head = &napi->gro_hash[index].list; + struct sk_buff *skb, *p; + + list_for_each_entry_safe_reverse(skb, p, head, list) { + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + skb_list_del_init(skb); + napi_gro_complete(napi, skb); + napi->gro_hash[index].count--; + } + + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); +} + +/* napi->gro_hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; + + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); + } +} +EXPORT_SYMBOL(napi_gro_flush); + +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) +{ + unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; + + list_for_each_entry(p, head, list) { + unsigned long diffs; + + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); + diffs |= skb_metadata_differs(p, skb); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + } + + NAPI_GRO_CB(p)->same_flow = !diffs; + } +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (!skb_headlen(skb) && pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + skb_frag_off_add(&pinfo->frags[0], grow); + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. + */ + skb_list_del_init(oldest); + napi_gro_complete(napi, oldest); +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; + struct list_head *head = &offload_base; + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct sk_buff *pp = NULL; + enum gro_result ret; + int same_flow; + int grow; + + if (netif_elide_gro(skb->dev)) + goto normal; + + gro_list_prepare(&gro_list->list, skb); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type == type && ptype->callbacks.gro_receive) + goto found_ptype; + } + rcu_read_unlock(); + goto normal; + +found_ptype: + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), + sizeof(u32))); /* Avoid slow unaligned acc */ + *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->count = 1; + if (unlikely(skb_is_gso(skb))) { + NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; + /* Only support TCP and non DODGY users. */ + if (!skb_is_gso_tcp(skb) || + (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) + NAPI_GRO_CB(skb)->flush = 1; + } + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + break; + } + + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + &gro_list->list, skb); + + rcu_read_unlock(); + + if (PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + skb_list_del_init(pp); + napi_gro_complete(napi, pp); + gro_list->count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush) + goto normal; + + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + + NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; + if (!skb_is_gso(skb)) + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + list_add(&skb->list, &gro_list->list); + ret = GRO_HELD; + +pull: + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); +ok: + if (gro_list->count) { + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); + } + + return ret; + +normal: + ret = GRO_NORMAL; + goto pull; +} + +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); + +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + gro_result_t ret; + + skb_mark_napi_id(skb, napi); + trace_napi_gro_receive_entry(skb); + + skb_gro_reset_offset(skb, 0); + + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_receive_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + __vlan_hwaccel_clear_tag(skb); + skb->dev = napi->dev; + skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_size = 0; + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL) + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb, hlen); + + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + eth = (const struct ethhdr *)skb->data; + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; + } + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; + + return skb; +} + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + gro_result_t ret; + struct sk_buff *skb = napi_frags_skb(napi); + + trace_napi_gro_frags_entry(skb); + + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_frags); + +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c new file mode 100644 index 000000000..ed5ec5de4 --- /dev/null +++ b/net/core/gro_cells.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <net/gro_cells.h> + +struct gro_cell { + struct sk_buff_head napi_skbs; + struct napi_struct napi; +}; + +int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct gro_cell *cell; + int res; + + rcu_read_lock(); + if (unlikely(!(dev->flags & IFF_UP))) + goto drop; + + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { + res = netif_rx(skb); + goto unlock; + } + + cell = this_cpu_ptr(gcells->cells); + + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { +drop: + dev_core_stats_rx_dropped_inc(dev); + kfree_skb(skb); + res = NET_RX_DROP; + goto unlock; + } + + __skb_queue_tail(&cell->napi_skbs, skb); + if (skb_queue_len(&cell->napi_skbs) == 1) + napi_schedule(&cell->napi); + + res = NET_RX_SUCCESS; + +unlock: + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL(gro_cells_receive); + +/* called under BH context */ +static int gro_cell_poll(struct napi_struct *napi, int budget) +{ + struct gro_cell *cell = container_of(napi, struct gro_cell, napi); + struct sk_buff *skb; + int work_done = 0; + + while (work_done < budget) { + skb = __skb_dequeue(&cell->napi_skbs); + if (!skb) + break; + napi_gro_receive(napi, skb); + work_done++; + } + + if (work_done < budget) + napi_complete_done(napi, work_done); + return work_done; +} + +int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) +{ + int i; + + gcells->cells = alloc_percpu(struct gro_cell); + if (!gcells->cells) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + __skb_queue_head_init(&cell->napi_skbs); + + set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); + + netif_napi_add(dev, &cell->napi, gro_cell_poll); + napi_enable(&cell->napi); + } + return 0; +} +EXPORT_SYMBOL(gro_cells_init); + +struct percpu_free_defer { + struct rcu_head rcu; + void __percpu *ptr; +}; + +static void percpu_free_defer_callback(struct rcu_head *head) +{ + struct percpu_free_defer *defer; + + defer = container_of(head, struct percpu_free_defer, rcu); + free_percpu(defer->ptr); + kfree(defer); +} + +void gro_cells_destroy(struct gro_cells *gcells) +{ + struct percpu_free_defer *defer; + int i; + + if (!gcells->cells) + return; + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + napi_disable(&cell->napi); + __netif_napi_del(&cell->napi); + __skb_queue_purge(&cell->napi_skbs); + } + /* We need to observe an rcu grace period before freeing ->cells, + * because netpoll could access dev->napi_list under rcu protection. + * Try hard using call_rcu() instead of synchronize_rcu(), + * because we might be called from cleanup_net(), and we + * definitely do not want to block this critical task. + */ + defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + if (likely(defer)) { + defer->ptr = gcells->cells; + call_rcu(&defer->rcu, percpu_free_defer_callback); + } else { + /* We do not hold RTNL at this point, synchronize_net() + * would not be able to expedite this sync. + */ + synchronize_rcu_expedited(); + free_percpu(gcells->cells); + } + gcells->cells = NULL; +} +EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/hwbm.c b/net/core/hwbm.c new file mode 100644 index 000000000..ac1a66df9 --- /dev/null +++ b/net/core/hwbm.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Support for hardware buffer manager. + * + * Copyright (C) 2016 Marvell + * + * Gregory CLEMENT <gregory.clement@free-electrons.com> + */ +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/skbuff.h> +#include <net/hwbm.h> + +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) +{ + if (likely(bm_pool->frag_size <= PAGE_SIZE)) + skb_free_frag(buf); + else + kfree(buf); +} +EXPORT_SYMBOL_GPL(hwbm_buf_free); + +/* Refill processing for HW buffer management */ +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) +{ + int frag_size = bm_pool->frag_size; + void *buf; + + if (likely(frag_size <= PAGE_SIZE)) + buf = netdev_alloc_frag(frag_size); + else + buf = kmalloc(frag_size, gfp); + + if (!buf) + return -ENOMEM; + + if (bm_pool->construct) + if (bm_pool->construct(bm_pool, buf)) { + hwbm_buf_free(bm_pool, buf); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(hwbm_pool_refill); + +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num) +{ + int err, i; + + mutex_lock(&bm_pool->buf_lock); + if (bm_pool->buf_num == bm_pool->size) { + pr_warn("pool already filled\n"); + mutex_unlock(&bm_pool->buf_lock); + return bm_pool->buf_num; + } + + if (buf_num + bm_pool->buf_num > bm_pool->size) { + pr_warn("cannot allocate %d buffers for pool\n", + buf_num); + mutex_unlock(&bm_pool->buf_lock); + return 0; + } + + if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) { + pr_warn("Adding %d buffers to the %d current buffers will overflow\n", + buf_num, bm_pool->buf_num); + mutex_unlock(&bm_pool->buf_lock); + return 0; + } + + for (i = 0; i < buf_num; i++) { + err = hwbm_pool_refill(bm_pool, GFP_KERNEL); + if (err < 0) + break; + } + + /* Update BM driver with number of buffers added to pool */ + bm_pool->buf_num += i; + + pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num); + mutex_unlock(&bm_pool->buf_lock); + + return i; +} +EXPORT_SYMBOL_GPL(hwbm_pool_add); diff --git a/net/core/link_watch.c b/net/core/link_watch.c new file mode 100644 index 000000000..aa6cb1f90 --- /dev/null +++ b/net/core/link_watch.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Linux network device link state notification + * + * Author: + * Stefan Rompf <sux@loplof.de> + */ + +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/rtnetlink.h> +#include <linux/jiffies.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/bitops.h> +#include <linux/types.h> + +#include "dev.h" + +enum lw_bits { + LW_URGENT = 0, +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); + +static LIST_HEAD(lweventlist); +static DEFINE_SPINLOCK(lweventlist_lock); + +static unsigned char default_operstate(const struct net_device *dev) +{ + if (netif_testing(dev)) + return IF_OPER_TESTING; + + if (!netif_carrier_ok(dev)) + return (dev->ifindex != dev_get_iflink(dev) ? + IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); + + if (netif_dormant(dev)) + return IF_OPER_DORMANT; + + return IF_OPER_UP; +} + + +static void rfc2863_policy(struct net_device *dev) +{ + unsigned char operstate = default_operstate(dev); + + if (operstate == dev->operstate) + return; + + write_lock(&dev_base_lock); + + switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + + case IF_LINK_MODE_DORMANT: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_DORMANT; + break; + case IF_LINK_MODE_DEFAULT: + default: + break; + } + + dev->operstate = operstate; + + write_unlock(&dev_base_lock); +} + + +void linkwatch_init_dev(struct net_device *dev) +{ + /* Handle pre-registration link state changes */ + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) + rfc2863_policy(dev); +} + + +static bool linkwatch_urgent_event(struct net_device *dev) +{ + if (!netif_running(dev)) + return false; + + if (dev->ifindex != dev_get_iflink(dev)) + return true; + + if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) + return true; + + return netif_carrier_ok(dev) && qdisc_tx_changing(dev); +} + + +static void linkwatch_add_event(struct net_device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (list_empty(&dev->link_watch_list)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); + } + spin_unlock_irqrestore(&lweventlist_lock, flags); +} + + +static void linkwatch_schedule_work(int urgent) +{ + unsigned long delay = linkwatch_nextevent - jiffies; + + if (test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* Minimise down-time: drop delay for up event. */ + if (urgent) { + if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) + return; + delay = 0; + } + + /* If we wrap around we'll delay it by at most HZ. */ + if (delay > HZ) + delay = 0; + + /* + * If urgent, schedule immediate execution; otherwise, don't + * override the existing timer. + */ + if (test_bit(LW_URGENT, &linkwatch_flags)) + mod_delayed_work(system_wq, &linkwatch_work, 0); + else + schedule_delayed_work(&linkwatch_work, delay); +} + + +static void linkwatch_do_dev(struct net_device *dev) +{ + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. + */ + smp_mb__before_atomic(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + /* Note: our callers are responsible for calling netdev_tracker_free(). + * This is the reason we use __dev_put() instead of dev_put(). + */ + __dev_put(dev); +} + +static void __linkwatch_run_queue(int urgent_only) +{ +#define MAX_DO_DEV_PER_LOOP 100 + + int do_dev = MAX_DO_DEV_PER_LOOP; + struct net_device *dev; + LIST_HEAD(wrk); + + /* Give urgent case more budget */ + if (urgent_only) + do_dev += MAX_DO_DEV_PER_LOOP; + + /* + * Limit the number of linkwatch events to one + * per second so that a runaway driver does not + * cause a storm of messages on the netlink + * socket. This limit does not apply to up events + * while the device qdisc is down. + */ + if (!urgent_only) + linkwatch_nextevent = jiffies + HZ; + /* Limit wrap-around effect on delay. */ + else if (time_after(linkwatch_nextevent, jiffies + HZ)) + linkwatch_nextevent = jiffies; + + clear_bit(LW_URGENT, &linkwatch_flags); + + spin_lock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &wrk); + + while (!list_empty(&wrk) && do_dev > 0) { + + dev = list_first_entry(&wrk, struct net_device, link_watch_list); + list_del_init(&dev->link_watch_list); + + if (!netif_device_present(dev) || + (urgent_only && !linkwatch_urgent_event(dev))) { + list_add_tail(&dev->link_watch_list, &lweventlist); + continue; + } + /* We must free netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); + spin_unlock_irq(&lweventlist_lock); + linkwatch_do_dev(dev); + do_dev--; + spin_lock_irq(&lweventlist_lock); + } + + /* Add the remaining work back to lweventlist */ + list_splice_init(&wrk, &lweventlist); + + if (!list_empty(&lweventlist)) + linkwatch_schedule_work(0); + spin_unlock_irq(&lweventlist_lock); +} + +void linkwatch_forget_dev(struct net_device *dev) +{ + unsigned long flags; + int clean = 0; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (!list_empty(&dev->link_watch_list)) { + list_del_init(&dev->link_watch_list); + clean = 1; + /* We must release netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); + } + spin_unlock_irqrestore(&lweventlist_lock, flags); + if (clean) + linkwatch_do_dev(dev); +} + + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ + __linkwatch_run_queue(0); +} + + +static void linkwatch_event(struct work_struct *dummy) +{ + rtnl_lock(); + __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); + rtnl_unlock(); +} + + +void linkwatch_fire_event(struct net_device *dev) +{ + bool urgent = linkwatch_urgent_event(dev); + + if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { + linkwatch_add_event(dev); + } else if (!urgent) + return; + + linkwatch_schedule_work(urgent); +} +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000..4a0797f0a --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> + */ + +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <net/lwtunnel.h> +#include <net/gre.h> +#include <net/ip6_route.h> +#include <net/ipv6_stubs.h> + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Migration disable and BH disable are needed to protect per-cpu + * redirect_info between BPF prog and skb_do_redirect(). + */ + migrate_disable(); + local_bh_disable(); + bpf_compute_data_pointers(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + + switch (ret) { + case BPF_OK: + case BPF_LWT_REROUTE: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : "<unknown>"); + ret = BPF_OK; + } else { + skb_reset_mac_header(skb); + skb_do_redirect(skb); + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + local_bh_enable(); + migrate_enable(); + + return ret; +} + +static int bpf_lwt_input_reroute(struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->protocol == htons(ETH_P_IP)) { + struct net_device *dev = skb_dst(skb)->dev; + struct iphdr *iph = ip_hdr(skb); + + dev_hold(dev); + skb_dst_drop(skb); + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + dev_put(dev); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + skb_dst_drop(skb); + err = ipv6_stub->ipv6_route_input(skb); + } else { + err = -EAFNOSUPPORT; + } + + if (err) + goto err; + return dst_input(skb); + +err: + kfree_skb(skb); + return err; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + if (ret == BPF_LWT_REROUTE) + return bpf_lwt_input_reroute(skb); + } + + if (unlikely(!dst->lwtstate->orig_input)) { + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) +{ + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_lwt_xmit_reroute(struct sk_buff *skb) +{ + struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); + int oif = l3mdev ? l3mdev->ifindex : 0; + struct dst_entry *dst = NULL; + int err = -EAFNOSUPPORT; + struct sock *sk; + struct net *net; + bool ipv4; + + if (skb->protocol == htons(ETH_P_IP)) + ipv4 = true; + else if (skb->protocol == htons(ETH_P_IPV6)) + ipv4 = false; + else + goto err; + + sk = sk_to_full_sk(skb->sk); + if (sk) { + if (sk->sk_bound_dev_if) + oif = sk->sk_bound_dev_if; + net = sock_net(sk); + } else { + net = dev_net(skb_dst(skb)->dev); + } + + if (ipv4) { + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = {}; + struct rtable *rt; + + fl4.flowi4_oif = oif; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_uid = sock_net_uid(net, sk); + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; + fl4.flowi4_proto = iph->protocol; + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err; + } + dst = &rt->dst; + } else { + struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct flowi6 fl6 = {}; + + fl6.flowi6_oif = oif; + fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(net, sk); + fl6.flowlabel = ip6_flowinfo(iph6); + fl6.flowi6_proto = iph6->nexthdr; + fl6.daddr = iph6->daddr; + fl6.saddr = iph6->saddr; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err; + } + } + if (unlikely(dst->error)) { + err = dst->error; + dst_release(dst); + goto err; + } + + /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it + * was done for the previous dst, so we are doing it here again, in + * case the new dst needs much more space. The call below is a noop + * if there is enough header space in skb. + */ + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto err; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); + if (unlikely(err)) + return net_xmit_errno(err); + + /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ + return LWTUNNEL_XMIT_DONE; + +err: + kfree_skb(skb); + return err; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int hh_len = dst->dev->hard_header_len; + __be16 proto = skb->protocol; + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header changed, e.g. via bpf_lwt_push_encap, + * BPF_LWT_REROUTE below should have been used if the + * protocol was also changed. + */ + if (skb->protocol != proto) { + kfree_skb(skb); + return -EINVAL; + } + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. + */ + ret = xmit_check_hhlen(skb, hh_len); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + case BPF_LWT_REROUTE: + return bpf_lwt_xmit_reroute(skb); + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr, + bpf_prog_policy, NULL); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, + extack); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start_noflag(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, + .owner = THIS_MODULE, +}; + +static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, + int encap_len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; + skb_decrease_gso_size(shinfo, encap_len); + shinfo->gso_segs = 0; + return 0; +} + +static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) +{ + int next_hdr_offset; + void *next_hdr; + __u8 protocol; + + /* SCTP and UDP_L4 gso need more nuanced handling than what + * handle_gso_type() does above: skb_decrease_gso_size() is not enough. + * So at the moment only TCP GSO packets are let through. + */ + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return -ENOTSUPP; + + if (ipv4) { + protocol = ip_hdr(skb)->protocol; + next_hdr_offset = sizeof(struct iphdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } else { + protocol = ipv6_hdr(skb)->nexthdr; + next_hdr_offset = sizeof(struct ipv6hdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } + + switch (protocol) { + case IPPROTO_GRE: + next_hdr_offset += sizeof(struct gre_base_hdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) + return handle_gso_type(skb, SKB_GSO_GRE_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_GRE, encap_len); + + case IPPROTO_UDP: + next_hdr_offset += sizeof(struct udphdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct udphdr *)next_hdr)->check) + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); + + case IPPROTO_IP: + case IPPROTO_IPV6: + if (ipv4) + return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); + else + return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); + + default: + return -EPROTONOSUPPORT; + } +} + +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) +{ + struct iphdr *iph; + bool ipv4; + int err; + + if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) + return -EINVAL; + + /* validate protocol and length */ + iph = (struct iphdr *)hdr; + if (iph->version == 4) { + ipv4 = true; + if (unlikely(len < iph->ihl * 4)) + return -EINVAL; + } else if (iph->version == 6) { + ipv4 = false; + if (unlikely(len < sizeof(struct ipv6hdr))) + return -EINVAL; + } else { + return -EINVAL; + } + + if (ingress) + err = skb_cow_head(skb, len + skb->mac_len); + else + err = skb_cow_head(skb, + len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + return err; + + /* push the encap headers and fix pointers */ + skb_reset_inner_headers(skb); + skb_reset_inner_mac_header(skb); /* mac header is not yet set */ + skb_set_inner_protocol(skb, skb->protocol); + skb->encapsulation = 1; + skb_push(skb, len); + if (ingress) + skb_postpush_rcsum(skb, iph, len); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), hdr, len); + bpf_compute_data_pointers(skb); + skb_clear_hash(skb); + + if (ipv4) { + skb->protocol = htons(ETH_P_IP); + iph = ip_hdr(skb); + + if (!iph->check) + iph->check = ip_fast_csum((unsigned char *)iph, + iph->ihl); + } else { + skb->protocol = htons(ETH_P_IPV6); + } + + if (skb_is_gso(skb)) + return handle_gso_encap(skb, ipv4, len); + + return 0; +} + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000..711cd3b43 --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> + */ + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/lwtunnel.h> +#include <linux/in.h> +#include <linux/init.h> +#include <linux/err.h> + +#include <net/lwtunnel.h> +#include <net/rtnetlink.h> +#include <net/ip6_fib.h> +#include <net/rtnh.h> + +DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); + +#ifdef CONFIG_MODULES + +static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) +{ + /* Only lwt encaps implemented without using an interface for + * the encap need to return a string here. + */ + switch (encap_type) { + case LWTUNNEL_ENCAP_MPLS: + return "MPLS"; + case LWTUNNEL_ENCAP_ILA: + return "ILA"; + case LWTUNNEL_ENCAP_SEG6: + return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; + case LWTUNNEL_ENCAP_SEG6_LOCAL: + return "SEG6LOCAL"; + case LWTUNNEL_ENCAP_RPL: + return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; + case LWTUNNEL_ENCAP_XFRM: + /* module autoload not supported for encap type */ + return NULL; + case LWTUNNEL_ENCAP_IP6: + case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_NONE: + case __LWTUNNEL_ENCAP_MAX: + /* should not have got here */ + WARN_ON(1); + break; + } + return NULL; +} + +#endif /* CONFIG_MODULES */ + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); + +static const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net *net, u16 encap_type, + struct nlattr *encap, unsigned int family, + const void *cfg, struct lwtunnel_state **lws, + struct netlink_ext_ack *extack) +{ + const struct lwtunnel_encap_ops *ops; + bool found = false; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG_ATTR(extack, encap, + "Unknown LWT encapsulation type"); + return ret; + } + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state && try_module_get(ops->owner))) + found = true; + rcu_read_unlock(); + + if (found) { + ret = ops->build_state(net, encap, family, cfg, lws, extack); + if (ret) + module_put(ops->owner); + } else { + /* don't rely on -EOPNOTSUPP to detect match as build_state + * handlers could return it + */ + NL_SET_ERR_MSG_ATTR(extack, encap, + "LWT encapsulation type not supported"); + } + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_build_state); + +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); + return ret; + } + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + rcu_read_unlock(); +#ifdef CONFIG_MODULES + if (!ops) { + const char *encap_type_str = lwtunnel_encap_str(encap_type); + + if (encap_type_str) { + __rtnl_unlock(); + request_module("rtnl-lwt-%s", encap_type_str); + rtnl_lock(); + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + rcu_read_unlock(); + } + } +#endif + ret = ops ? 0 : -EOPNOTSUPP; + if (ret < 0) + NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); + +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, + struct netlink_ext_ack *extack) +{ + struct rtnexthop *rtnh = (struct rtnexthop *)attr; + struct nlattr *nla_entype; + struct nlattr *attrs; + u16 encap_type; + int attrlen; + + while (rtnh_ok(rtnh, remaining)) { + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + attrs = rtnh_attrs(rtnh); + nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + + if (nla_entype) { + if (nla_len(nla_entype) < sizeof(u16)) { + NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); + return -EINVAL; + } + encap_type = nla_get_u16(nla_entype); + + if (lwtunnel_valid_encap_type(encap_type, + extack) != 0) + return -EOPNOTSUPP; + } + } + rtnh = rtnh_next(rtnh, &remaining); + } + + return 0; +} +EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); + +void lwtstate_free(struct lwtunnel_state *lws) +{ + const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; + + if (ops->destroy_state) { + ops->destroy_state(lws); + kfree_rcu(lws, rcu); + } else { + kfree(lws); + } + module_put(ops->owner); +} +EXPORT_SYMBOL_GPL(lwtstate_free); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + nest = nla_nest_start_noflag(skb, encap_attr); + if (!nest) + return -EMSGSIZE; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); + +int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(net, sk, skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_output); + +int lwtunnel_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->xmit)) + ret = ops->xmit(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_xmit); + +int lwtunnel_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->input)) + ret = ops->input(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL_GPL(lwtunnel_input); diff --git a/net/core/neighbour.c b/net/core/neighbour.c new file mode 100644 index 000000000..c842f150c --- /dev/null +++ b/net/core/neighbour.c @@ -0,0 +1,3887 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + * Harald Welte Add neighbour cache statistics like rtstat + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/kmemleak.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/times.h> +#include <net/net_namespace.h> +#include <net/neighbour.h> +#include <net/arp.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/netevent.h> +#include <net/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/log2.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> + +#include <trace/events/neigh.h> + +#define NEIGH_DEBUG 1 +#define neigh_dbg(level, fmt, ...) \ +do { \ + if (level <= NEIGH_DEBUG) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + +#define PNEIGH_HASHMASK 0xF + +static void neigh_timer_handler(struct timer_list *t); +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid); +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); +static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev); + +#ifdef CONFIG_PROC_FS +static const struct seq_operations neigh_stat_seq_ops; +#endif + +/* + Neighbour hash table buckets are protected with rwlock tbl->lock. + + - All the scans/updates to hash buckets MUST be made under this lock. + - NOTHING clever should be made under this lock: no callbacks + to protocol backends, no attempts to send something to network. + It will result in deadlocks, if backend/driver wants to use neighbour + cache. + - If the entry requires some non-trivial actions, increase + its reference count and release table lock. + + Neighbour entries are protected: + - with reference count. + - with rwlock neigh->lock + + Reference count prevents destruction. + + neigh->lock mainly serializes ll address data and its validity state. + However, the same lock is used to protect another entry fields: + - timer + - resolution queue + + Again, nothing clever shall be made under neigh->lock, + the most complicated procedure, which we allow is dev->hard_header. + It is supposed, that dev->hard_header is simplistic and does + not make callbacks to neighbour tables. + */ + +static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +static void neigh_cleanup_and_release(struct neighbour *neigh) +{ + trace_neigh_cleanup_and_release(neigh, 0); + __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + neigh_release(neigh); +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonable choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return base ? prandom_u32_max(base) + (base >> 1) : 0; +} +EXPORT_SYMBOL(neigh_rand_reach_time); + +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } + if (!list_empty(&n->managed_list)) + list_del_init(&n->managed_list); +} + +static void neigh_update_gc_list(struct neighbour *n) +{ + bool on_gc_list, exempt_from_gc; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + /* remove from the gc list if new state is permanent or if neighbor + * is externally learned; otherwise entry should be on the gc list + */ + exempt_from_gc = n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_LEARNED; + on_gc_list = !list_empty(&n->gc_list); + + if (exempt_from_gc && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!exempt_from_gc && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } +out: + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} + +static void neigh_update_managed_list(struct neighbour *n) +{ + bool on_managed_list, add_to_managed; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + add_to_managed = n->flags & NTF_MANAGED; + on_managed_list = !list_empty(&n->managed_list); + + if (!add_to_managed && on_managed_list) + list_del_init(&n->managed_list); + else if (add_to_managed && !on_managed_list) + list_add_tail(&n->managed_list, &n->tbl->managed_list); +out: + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} + +static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, + bool *gc_update, bool *managed_update) +{ + u32 ndm_flags, old_flags = neigh->flags; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return; + + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + + if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + *notify = 1; + *gc_update = true; + } + if ((old_flags ^ ndm_flags) & NTF_MANAGED) { + if (ndm_flags & NTF_MANAGED) + neigh->flags |= NTF_MANAGED; + else + neigh->flags &= ~NTF_MANAGED; + *notify = 1; + *managed_update = true; + } +} + +static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, + struct neigh_table *tbl) +{ + bool retval = false; + + write_lock(&n->lock); + if (refcount_read(&n->refcnt) == 1) { + struct neighbour *neigh; + + neigh = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + rcu_assign_pointer(*np, neigh); + neigh_mark_dead(n); + retval = true; + } + write_unlock(&n->lock); + if (retval) + neigh_cleanup_and_release(n); + return retval; +} + +bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) +{ + struct neigh_hash_table *nht; + void *pkey = ndel->primary_key; + u32 hash_val; + struct neighbour *n; + struct neighbour __rcu **np; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); + hash_val = hash_val >> (32 - nht->hash_shift); + + np = &nht->hash_buckets[hash_val]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock)))) { + if (n == ndel) + return neigh_del(n, np, tbl); + np = &n->next; + } + return false; +} + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int max_clean = atomic_read(&tbl->gc_entries) - + READ_ONCE(tbl->gc_thresh2); + u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; + unsigned long tref = jiffies - 5 * HZ; + struct neighbour *n, *tmp; + int shrunk = 0; + int loop = 0; + + NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); + + write_lock_bh(&tbl->lock); + + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if ((n->nud_state == NUD_FAILED) || + (n->nud_state == NUD_NOARP) || + (tbl->is_multicast && + tbl->is_multicast(n->primary_key)) || + !time_in_range(n->updated, tref, jiffies)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; + if (++loop == 16) { + if (ktime_get_ns() > tmax) + goto unlock; + loop = 0; + } + } + } + + WRITE_ONCE(tbl->last_flush, jiffies); +unlock: + write_unlock_bh(&tbl->lock); + + return shrunk; +} + +static void neigh_add_timer(struct neighbour *n, unsigned long when) +{ + /* Use safe distance from the jiffies - LONG_MAX point while timer + * is running in DELAY/PROBE state but still show to user space + * large times in the past. + */ + unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); + + neigh_hold(n); + if (!time_in_range(n->confirmed, mint, jiffies)) + n->confirmed = mint; + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; + if (unlikely(mod_timer(&n->timer, when))) { + printk("NEIGH: BUG, double timer add, state is %x\n", + n->nud_state); + dump_stack(); + } +} + +static int neigh_del_timer(struct neighbour *n) +{ + if ((n->nud_state & NUD_IN_TIMER) && + del_timer(&n->timer)) { + neigh_release(n); + return 1; + } + return 0; +} + +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) +{ + struct sk_buff_head tmp; + unsigned long flags; + struct sk_buff *skb; + + skb_queue_head_init(&tmp); + spin_lock_irqsave(&list->lock, flags); + skb = skb_peek(list); + while (skb != NULL) { + struct sk_buff *skb_next = skb_peek_next(skb, list); + struct net_device *dev = skb->dev; + + if (net == NULL || net_eq(dev_net(dev), net)) { + neigh_parms_qlen_dec(dev, family); + __skb_unlink(skb, list); + __skb_queue_tail(&tmp, skb); + } + skb = skb_next; + } + spin_unlock_irqrestore(&list->lock, flags); + + while ((skb = __skb_dequeue(&tmp))) { + dev_put(skb->dev); + kfree_skb(skb); + } +} + +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) +{ + int i; + struct neigh_hash_table *nht; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + if (skip_perm && n->nud_state & NUD_PERMANENT) { + np = &n->next; + continue; + } + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + write_lock(&n->lock); + neigh_del_timer(n); + neigh_mark_dead(n); + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + neigh_dbg(2, "neigh %p is stray\n", n); + } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + } + } +} + +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ + write_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev, false); + write_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_changeaddr); + +static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) +{ + write_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev, skip_perm); + pneigh_ifdown_and_unlock(tbl, dev); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, + tbl->family); + if (skb_queue_empty_lockless(&tbl->proxy_queue)) + del_timer_sync(&tbl->proxy_timer); + return 0; +} + +int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, true); + return 0; +} +EXPORT_SYMBOL(neigh_carrier_down); + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, false); + return 0; +} +EXPORT_SYMBOL(neigh_ifdown); + +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + u32 flags, bool exempt_from_gc) +{ + struct neighbour *n = NULL; + unsigned long now = jiffies; + int entries, gc_thresh3; + + if (exempt_from_gc) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; + gc_thresh3 = READ_ONCE(tbl->gc_thresh3); + if (entries >= gc_thresh3 || + (entries >= READ_ONCE(tbl->gc_thresh2) && + time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { + net_info_ratelimited("%s: neighbor table overflow!\n", + tbl->id); + NEIGH_CACHE_STAT_INC(tbl, table_fulls); + goto out_entries; + } + } + +do_alloc: + n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); + if (!n) + goto out_entries; + + __skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->flags = flags; + seqlock_init(&n->hh.hh_lock); + n->parms = neigh_parms_clone(&tbl->parms); + timer_setup(&n->timer, neigh_timer_handler, 0); + + NEIGH_CACHE_STAT_INC(tbl, allocs); + n->tbl = tbl; + refcount_set(&n->refcnt, 1); + n->dead = 1; + INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->managed_list); + + atomic_inc(&tbl->entries); +out: + return n; + +out_entries: + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); + goto out; +} + +static void neigh_get_hash_rnd(u32 *x) +{ + *x = get_random_u32() | 1; +} + +static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) +{ + size_t size = (1 << shift) * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour __rcu **buckets; + int i; + + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) { + buckets = kzalloc(size, GFP_ATOMIC); + } else { + buckets = (struct neighbour __rcu **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); + } + if (!buckets) { + kfree(ret); + return NULL; + } + ret->hash_buckets = buckets; + ret->hash_shift = shift; + for (i = 0; i < NEIGH_NUM_HASH_RND; i++) + neigh_get_hash_rnd(&ret->hash_rnd[i]); + return ret; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); + struct neighbour __rcu **buckets = nht->hash_buckets; + + if (size <= PAGE_SIZE) { + kfree(buckets); + } else { + kmemleak_free(buckets); + free_pages((unsigned long)buckets, get_order(size)); + } + kfree(nht); +} + +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_shift) +{ + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; + + NEIGH_CACHE_STAT_INC(tbl, hash_grows); + + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_shift); + if (!new_nht) + return old_nht; + + for (i = 0; i < (1 << old_nht->hash_shift); i++) { + struct neighbour *n, *next; + + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); + + hash >>= (32 - new_nht->hash_shift); + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); + } + } + + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; +} + +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct neighbour *n; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + rcu_read_lock(); + n = __neigh_lookup_noref(tbl, pkey, dev); + if (n) { + if (!refcount_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); + } + + rcu_read_unlock(); + return n; +} +EXPORT_SYMBOL(neigh_lookup); + +static struct neighbour * +___neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, u32 flags, + bool exempt_from_gc, bool want_ref) +{ + u32 hash_val, key_len = tbl->key_len; + struct neighbour *n1, *rc, *n; + struct neigh_hash_table *nht; + int error; + + n = neigh_alloc(tbl, dev, flags, exempt_from_gc); + trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); + if (!n) { + rc = ERR_PTR(-ENOBUFS); + goto out; + } + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); + + /* Protocol specific setup. */ + if (tbl->constructor && (error = tbl->constructor(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + if (dev->netdev_ops->ndo_neigh_construct) { + error = dev->netdev_ops->ndo_neigh_construct(dev, n); + if (error < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + } + + /* Device specific setup. */ + if (n->parms->neigh_setup && + (error = n->parms->neigh_setup(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) + nht = neigh_hash_grow(tbl, nht->hash_shift + 1); + + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + + if (n->parms->dead) { + rc = ERR_PTR(-EINVAL); + goto out_tbl_unlock; + } + + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { + if (want_ref) + neigh_hold(n1); + rc = n1; + goto out_tbl_unlock; + } + } + + n->dead = 0; + if (!exempt_from_gc) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + if (n->flags & NTF_MANAGED) + list_add_tail(&n->managed_list, &n->tbl->managed_list); + if (want_ref) + neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); + write_unlock_bh(&tbl->lock); + neigh_dbg(2, "neigh %p is created\n", n); + rc = n; +out: + return rc; +out_tbl_unlock: + write_unlock_bh(&tbl->lock); +out_neigh_release: + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); + neigh_release(n); + goto out; +} + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); +} +EXPORT_SYMBOL(__neigh_create); + +static u32 pneigh_hash(const void *pkey, unsigned int key_len) +{ + u32 hash_val = *(u32 *)(pkey + key_len - 4); + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + return hash_val; +} + +static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, + struct net *net, + const void *pkey, + unsigned int key_len, + struct net_device *dev) +{ + while (n) { + if (!memcmp(n->key, pkey, key_len) && + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; + n = n->next; + } + return NULL; +} + +struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, struct net_device *dev) +{ + unsigned int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); +} +EXPORT_SYMBOL_GPL(__pneigh_lookup); + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev, int creat) +{ + struct pneigh_entry *n; + unsigned int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + if (n || !creat) + goto out; + + ASSERT_RTNL(); + + n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (!n) + goto out; + + write_pnet(&n->net, net); + memcpy(n->key, pkey, key_len); + n->dev = dev; + netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); + + if (tbl->pconstructor && tbl->pconstructor(n)) { + netdev_put(dev, &n->dev_tracker); + kfree(n); + n = NULL; + goto out; + } + + write_lock_bh(&tbl->lock); + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + write_unlock_bh(&tbl->lock); +out: + return n; +} +EXPORT_SYMBOL(pneigh_lookup); + + +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n, **np; + unsigned int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + write_lock_bh(&tbl->lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + np = &n->next) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev && + net_eq(pneigh_net(n), net)) { + *np = n->next; + write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) + tbl->pdestructor(n); + netdev_put(n->dev, &n->dev_tracker); + kfree(n); + return 0; + } + } + write_unlock_bh(&tbl->lock); + return -ENOENT; +} + +static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev) +{ + struct pneigh_entry *n, **np, *freelist = NULL; + u32 h; + + for (h = 0; h <= PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n = *np) != NULL) { + if (!dev || n->dev == dev) { + *np = n->next; + n->next = freelist; + freelist = n; + continue; + } + np = &n->next; + } + } + write_unlock_bh(&tbl->lock); + while ((n = freelist)) { + freelist = n->next; + n->next = NULL; + if (tbl->pdestructor) + tbl->pdestructor(n); + netdev_put(n->dev, &n->dev_tracker); + kfree(n); + } + return -ENOENT; +} + +static void neigh_parms_destroy(struct neigh_parms *parms); + +static inline void neigh_parms_put(struct neigh_parms *parms) +{ + if (refcount_dec_and_test(&parms->refcnt)) + neigh_parms_destroy(parms); +} + +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct net_device *dev = neigh->dev; + + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); + + if (!neigh->dead) { + pr_warn("Destroying alive neighbour %p\n", neigh); + dump_stack(); + return; + } + + if (neigh_del_timer(neigh)) + pr_warn("Impossible event\n"); + + write_lock_bh(&neigh->lock); + __skb_queue_purge(&neigh->arp_queue); + write_unlock_bh(&neigh->lock); + neigh->arp_queue_len_bytes = 0; + + if (dev->netdev_ops->ndo_neigh_destroy) + dev->netdev_ops->ndo_neigh_destroy(dev, neigh); + + netdev_put(dev, &neigh->dev_tracker); + neigh_parms_put(neigh->parms); + + neigh_dbg(2, "neigh %p is destroyed\n", neigh); + + atomic_dec(&neigh->tbl->entries); + kfree_rcu(neigh, rcu); +} +EXPORT_SYMBOL(neigh_destroy); + +/* Neighbour state is suspicious; + disable fast path. + + Called with write_locked neigh. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + neigh_dbg(2, "neigh %p is suspected\n", neigh); + + WRITE_ONCE(neigh->output, neigh->ops->output); +} + +/* Neighbour state is OK; + enable fast path. + + Called with write_locked neigh. + */ +static void neigh_connect(struct neighbour *neigh) +{ + neigh_dbg(2, "neigh %p is connected\n", neigh); + + WRITE_ONCE(neigh->output, neigh->ops->connected_output); +} + +static void neigh_periodic_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neighbour *n; + struct neighbour __rcu **np; + unsigned int i; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + /* + * periodically recompute ReachableTime from random function + */ + + if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { + struct neigh_parms *p; + + WRITE_ONCE(tbl->last_rand, jiffies); + list_for_each_entry(p, &tbl->parms_list, list) + p->reachable_time = + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + } + + if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) + goto out; + + for (i = 0 ; i < (1 << nht->hash_shift); i++) { + np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + unsigned int state; + + write_lock(&n->lock); + + state = n->nud_state; + if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || + (n->flags & NTF_EXT_LEARNED)) { + write_unlock(&n->lock); + goto next_elt; + } + + if (time_before(n->used, n->confirmed) && + time_is_before_eq_jiffies(n->confirmed)) + n->used = n->confirmed; + + if (refcount_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + !time_in_range_open(jiffies, n->used, + n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + neigh_mark_dead(n); + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } + write_unlock(&n->lock); + +next_elt: + np = &n->next; + } + /* + * It's fine to release lock here, even if hash table + * grows while we are preempted. + */ + write_unlock_bh(&tbl->lock); + cond_resched(); + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + } +out: + /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. + * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 + * BASE_REACHABLE_TIME. + */ + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); + write_unlock_bh(&tbl->lock); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : + NEIGH_VAR(p, MCAST_PROBES)); +} + +static void neigh_invalidate(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + struct sk_buff *skb; + + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + neigh_dbg(2, "neigh %p is failed\n", neigh); + neigh->updated = jiffies; + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + +static void neigh_probe(struct neighbour *neigh) + __releases(neigh->lock) +{ + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb = skb_clone(skb, GFP_ATOMIC); + write_unlock(&neigh->lock); + if (neigh->ops->solicit) + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + consume_skb(skb); +} + +/* Called when a timer expires for a neighbour entry. */ + +static void neigh_timer_handler(struct timer_list *t) +{ + unsigned long now, next; + struct neighbour *neigh = from_timer(neigh, t, timer); + unsigned int state; + int notify = 0; + + write_lock(&neigh->lock); + + state = neigh->nud_state; + now = jiffies; + next = now + HZ; + + if (!(state & NUD_IN_TIMER)) + goto out; + + if (state & NUD_REACHABLE) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->reachable_time)) { + neigh_dbg(2, "neigh %p is still alive\n", neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else if (time_before_eq(now, + neigh->used + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { + neigh_dbg(2, "neigh %p is delayed\n", neigh); + WRITE_ONCE(neigh->nud_state, NUD_DELAY); + neigh->updated = jiffies; + neigh_suspect(neigh); + next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); + } else { + neigh_dbg(2, "neigh %p is suspected\n", neigh); + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + neigh_suspect(neigh); + notify = 1; + } + } else if (state & NUD_DELAY) { + if (time_before_eq(now, + neigh->confirmed + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { + neigh_dbg(2, "neigh %p is now reachable\n", neigh); + WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); + neigh->updated = jiffies; + neigh_connect(neigh); + notify = 1; + next = neigh->confirmed + neigh->parms->reachable_time; + } else { + neigh_dbg(2, "neigh %p is probed\n", neigh); + WRITE_ONCE(neigh->nud_state, NUD_PROBE); + neigh->updated = jiffies; + atomic_set(&neigh->probes, 0); + notify = 1; + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); + } + } else { + /* NUD_PROBE|NUD_INCOMPLETE */ + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); + } + + if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && + atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + notify = 1; + neigh_invalidate(neigh); + goto out; + } + + if (neigh->nud_state & NUD_IN_TIMER) { + if (time_before(next, jiffies + HZ/100)) + next = jiffies + HZ/100; + if (!mod_timer(&neigh->timer, next)) + neigh_hold(neigh); + } + if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { + neigh_probe(neigh); + } else { +out: + write_unlock(&neigh->lock); + } + + if (notify) + neigh_update_notify(neigh, 0); + + trace_neigh_timer_handler(neigh, 0); + + neigh_release(neigh); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) +{ + int rc; + bool immediate_probe = false; + + write_lock_bh(&neigh->lock); + + rc = 0; + if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) + goto out_unlock_bh; + if (neigh->dead) + goto out_dead; + + if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { + if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + + NEIGH_VAR(neigh->parms, APP_PROBES)) { + unsigned long next, now = jiffies; + + atomic_set(&neigh->probes, + NEIGH_VAR(neigh->parms, UCAST_PROBES)); + neigh_del_timer(neigh); + WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); + neigh->updated = now; + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } + neigh_add_timer(neigh, next); + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh->updated = jiffies; + write_unlock_bh(&neigh->lock); + + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); + return 1; + } + } else if (neigh->nud_state & NUD_STALE) { + neigh_dbg(2, "neigh %p is delayed\n", neigh); + neigh_del_timer(neigh); + WRITE_ONCE(neigh->nud_state, NUD_DELAY); + neigh->updated = jiffies; + neigh_add_timer(neigh, jiffies + + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); + } + + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + while (neigh->arp_queue_len_bytes + skb->truesize > + NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { + struct sk_buff *buff; + + buff = __skb_dequeue(&neigh->arp_queue); + if (!buff) + break; + neigh->arp_queue_len_bytes -= buff->truesize; + kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); + NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); + } + skb_dst_force(skb); + __skb_queue_tail(&neigh->arp_queue, skb); + neigh->arp_queue_len_bytes += skb->truesize; + } + rc = 1; + } +out_unlock_bh: + if (immediate_probe) + neigh_probe(neigh); + else + write_unlock(&neigh->lock); + local_bh_enable(); + trace_neigh_event_send_done(neigh, rc); + return rc; + +out_dead: + if (neigh->nud_state & NUD_STALE) + goto out_unlock_bh; + write_unlock_bh(&neigh->lock); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); + trace_neigh_event_send_dead(neigh, 1); + return 1; +} +EXPORT_SYMBOL(__neigh_event_send); + +static void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) + = NULL; + + if (neigh->dev->header_ops) + update = neigh->dev->header_ops->cache_update; + + if (update) { + hh = &neigh->hh; + if (READ_ONCE(hh->hh_len)) { + write_seqlock_bh(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_sequnlock_bh(&hh->hh_lock); + } + } +} + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- flags + NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, + if it is different. + NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" + lladdr instead of overriding it + if it is different. + NEIGH_UPDATE_F_ADMIN means that the change is administrative. + NEIGH_UPDATE_F_USE means that the entry is user triggered. + NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. + NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing + NTF_ROUTER flag. + NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as + a router. + + Caller MUST hold reference count on the entry. + */ +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) +{ + bool gc_update = false, managed_update = false; + int update_isrouter = 0; + struct net_device *dev; + int err, notify = 0; + u8 old; + + trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); + + write_lock_bh(&neigh->lock); + + dev = neigh->dev; + old = neigh->nud_state; + err = -EPERM; + + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); + new = old; + goto out; + } + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; + + neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); + if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { + new = old & ~NUD_PERMANENT; + WRITE_ONCE(neigh->nud_state, new); + err = 0; + goto out; + } + + if (!(new & NUD_VALID)) { + neigh_del_timer(neigh); + if (old & NUD_CONNECTED) + neigh_suspect(neigh); + WRITE_ONCE(neigh->nud_state, new); + err = 0; + notify = old & NUD_VALID; + if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + (new & NUD_FAILED)) { + neigh_invalidate(neigh); + notify = 1; + } + goto out; + } + + /* Compare new lladdr with cached one */ + if (!dev->addr_len) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if ((old & NUD_VALID) && + !memcmp(lladdr, neigh->ha, dev->addr_len)) + lladdr = neigh->ha; + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. + */ + err = -EINVAL; + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); + goto out; + } + lladdr = neigh->ha; + } + + /* Update confirmed timestamp for neighbour entry after we + * received ARP packet even if it doesn't change IP to MAC binding. + */ + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + err = 0; + update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + if (old & NUD_VALID) { + if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { + update_isrouter = 0; + if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && + (old & NUD_CONNECTED)) { + lladdr = neigh->ha; + new = NUD_STALE; + } else + goto out; + } else { + if (lladdr == neigh->ha && new == NUD_STALE && + !(flags & NEIGH_UPDATE_F_ADMIN)) + new = old; + } + } + + /* Update timestamp only once we know we will make a change to the + * neighbour entry. Otherwise we risk to move the locktime window with + * noop updates and ignore relevant ARP updates. + */ + if (new != old || lladdr != neigh->ha) + neigh->updated = jiffies; + + if (new != old) { + neigh_del_timer(neigh); + if (new & NUD_PROBE) + atomic_set(&neigh->probes, 0); + if (new & NUD_IN_TIMER) + neigh_add_timer(neigh, (jiffies + + ((new & NUD_REACHABLE) ? + neigh->parms->reachable_time : + 0))); + WRITE_ONCE(neigh->nud_state, new); + notify = 1; + } + + if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); + memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); + neigh_update_hhs(neigh); + if (!(new & NUD_CONNECTED)) + neigh->confirmed = jiffies - + (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); + notify = 1; + } + if (new == old) + goto out; + if (new & NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old & NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; + write_unlock_bh(&neigh->lock); + + rcu_read_lock(); + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } + READ_ONCE(n1->output)(n1, skb); + if (n2) + neigh_release(n2); + rcu_read_unlock(); + + write_lock_bh(&neigh->lock); + } + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; + } +out: + if (update_isrouter) + neigh_update_is_router(neigh, flags, ¬ify); + write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || gc_update) + neigh_update_gc_list(neigh); + if (managed_update) + neigh_update_managed_list(neigh); + if (notify) + neigh_update_notify(neigh, nlmsg_pid); + trace_neigh_update_done(neigh, err); + return err; +} + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} +EXPORT_SYMBOL(neigh_update); + +/* Update the neigh to listen temporarily for probe responses, even if it is + * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. + */ +void __neigh_set_probe_once(struct neighbour *neigh) +{ + if (neigh->dead) + return; + neigh->updated = jiffies; + if (!(neigh->nud_state & NUD_FAILED)) + return; + WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); + atomic_set(&neigh->probes, neigh_max_probes(neigh)); + neigh_add_timer(neigh, + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); +} +EXPORT_SYMBOL(__neigh_set_probe_once); + +struct neighbour *neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct net_device *dev) +{ + struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, + lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_OVERRIDE, 0); + return neigh; +} +EXPORT_SYMBOL(neigh_event_ns); + +/* called with read_lock_bh(&n->lock); */ +static void neigh_hh_init(struct neighbour *n) +{ + struct net_device *dev = n->dev; + __be16 prot = n->tbl->protocol; + struct hh_cache *hh = &n->hh; + + write_lock_bh(&n->lock); + + /* Only one thread can come in here and initialize the + * hh_cache entry. + */ + if (!hh->hh_len) + dev->header_ops->cache(n, hh, prot); + + write_unlock_bh(&n->lock); +} + +/* Slow and careful. */ + +int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) +{ + int rc = 0; + + if (!neigh_event_send(neigh, skb)) { + int err; + struct net_device *dev = neigh->dev; + unsigned int seq; + + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) + neigh_hh_init(neigh); + + do { + __skb_pull(skb, skb_network_offset(skb)); + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + + if (err >= 0) + rc = dev_queue_xmit(skb); + else + goto out_kfree_skb; + } +out: + return rc; +out_kfree_skb: + rc = -EINVAL; + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(neigh_resolve_output); + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) +{ + struct net_device *dev = neigh->dev; + unsigned int seq; + int err; + + do { + __skb_pull(skb, skb_network_offset(skb)); + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + + if (err >= 0) + err = dev_queue_xmit(skb); + else { + err = -EINVAL; + kfree_skb(skb); + } + return err; +} +EXPORT_SYMBOL(neigh_connected_output); + +int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} +EXPORT_SYMBOL(neigh_direct_output); + +static void neigh_managed_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, + managed_work.work); + struct neighbour *neigh; + + write_lock_bh(&tbl->lock); + list_for_each_entry(neigh, &tbl->managed_list, managed_list) + neigh_event_send_probe(neigh, NULL, false); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, + NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); + write_unlock_bh(&tbl->lock); +} + +static void neigh_proxy_process(struct timer_list *t) +{ + struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb, *n; + + spin_lock(&tbl->proxy_queue.lock); + + skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { + long tdif = NEIGH_CB(skb)->sched_next - now; + + if (tdif <= 0) { + struct net_device *dev = skb->dev; + + neigh_parms_qlen_dec(dev, tbl->family); + __skb_unlink(skb, &tbl->proxy_queue); + + if (tbl->proxy_redo && netif_running(dev)) { + rcu_read_lock(); + tbl->proxy_redo(skb); + rcu_read_unlock(); + } else { + kfree_skb(skb); + } + + dev_put(dev); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) + mod_timer(&tbl->proxy_timer, jiffies + sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); + + if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { + kfree_skb(skb); + return; + } + + NEIGH_CB(skb)->sched_next = sched_next; + NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; + + spin_lock(&tbl->proxy_queue.lock); + if (del_timer(&tbl->proxy_timer)) { + if (time_before(tbl->proxy_timer.expires, sched_next)) + sched_next = tbl->proxy_timer.expires; + } + skb_dst_drop(skb); + dev_hold(skb->dev); + __skb_queue_tail(&tbl->proxy_queue, skb); + p->qlen++; + mod_timer(&tbl->proxy_timer, sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} +EXPORT_SYMBOL(pneigh_enqueue); + +static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, + struct net *net, int ifindex) +{ + struct neigh_parms *p; + + list_for_each_entry(p, &tbl->parms_list, list) { + if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || + (!p->dev && !ifindex && net_eq(net, &init_net))) + return p; + } + + return NULL; +} + +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, + struct neigh_table *tbl) +{ + struct neigh_parms *p; + struct net *net = dev_net(dev); + const struct net_device_ops *ops = dev->netdev_ops; + + p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); + if (p) { + p->tbl = tbl; + refcount_set(&p->refcnt, 1); + p->reachable_time = + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + p->qlen = 0; + netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); + p->dev = dev; + write_pnet(&p->net, net); + p->sysctl_table = NULL; + + if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + netdev_put(dev, &p->dev_tracker); + kfree(p); + return NULL; + } + + write_lock_bh(&tbl->lock); + list_add(&p->list, &tbl->parms.list); + write_unlock_bh(&tbl->lock); + + neigh_parms_data_state_cleanall(p); + } + return p; +} +EXPORT_SYMBOL(neigh_parms_alloc); + +static void neigh_rcu_free_parms(struct rcu_head *head) +{ + struct neigh_parms *parms = + container_of(head, struct neigh_parms, rcu_head); + + neigh_parms_put(parms); +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + if (!parms || parms == &tbl->parms) + return; + write_lock_bh(&tbl->lock); + list_del(&parms->list); + parms->dead = 1; + write_unlock_bh(&tbl->lock); + netdev_put(parms->dev, &parms->dev_tracker); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); +} +EXPORT_SYMBOL(neigh_parms_release); + +static void neigh_parms_destroy(struct neigh_parms *parms) +{ + kfree(parms); +} + +static struct lock_class_key neigh_table_proxy_queue_class; + +static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; + +void neigh_table_init(int index, struct neigh_table *tbl) +{ + unsigned long now = jiffies; + unsigned long phsize; + + INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); + INIT_LIST_HEAD(&tbl->managed_list); + + list_add(&tbl->parms.list, &tbl->parms_list); + write_pnet(&tbl->parms.net, &init_net); + refcount_set(&tbl->parms.refcnt, 1); + tbl->parms.reachable_time = + neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + tbl->parms.qlen = 0; + + tbl->stats = alloc_percpu(struct neigh_statistics); + if (!tbl->stats) + panic("cannot create neighbour cache statistics"); + +#ifdef CONFIG_PROC_FS + if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, + &neigh_stat_seq_ops, tbl)) + panic("cannot create neighbour proc dir entry"); +#endif + + RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); + + phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); + tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); + + if (!tbl->nht || !tbl->phash_buckets) + panic("cannot allocate neighbour cache hashes"); + + if (!tbl->entry_size) + tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + + tbl->key_len, NEIGH_PRIV_ALIGN); + else + WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); + + rwlock_init(&tbl->lock); + + INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); + queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, + tbl->parms.reachable_time); + INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); + + timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); + skb_queue_head_init_class(&tbl->proxy_queue, + &neigh_table_proxy_queue_class); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time * 20; + + neigh_tables[index] = tbl; +} +EXPORT_SYMBOL(neigh_table_init); + +int neigh_table_clear(int index, struct neigh_table *tbl) +{ + neigh_tables[index] = NULL; + /* It is not clean... Fix it to unload IPv6 module safely */ + cancel_delayed_work_sync(&tbl->managed_work); + cancel_delayed_work_sync(&tbl->gc_work); + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); + neigh_ifdown(tbl, NULL); + if (atomic_read(&tbl->entries)) + pr_crit("neighbour leakage\n"); + + call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, + neigh_hash_free_rcu); + tbl->nht = NULL; + + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + + remove_proc_entry(tbl->id, init_net.proc_net_stat); + + free_percpu(tbl->stats); + tbl->stats = NULL; + + return 0; +} +EXPORT_SYMBOL(neigh_table_clear); + +static struct neigh_table *neigh_find_table(int family) +{ + struct neigh_table *tbl = NULL; + + switch (family) { + case AF_INET: + tbl = neigh_tables[NEIGH_ARP_TABLE]; + break; + case AF_INET6: + tbl = neigh_tables[NEIGH_ND_TABLE]; + break; + } + + return tbl; +} + +const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, + [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, + [NDA_PROBES] = { .type = NLA_U32 }, + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_PORT] = { .type = NLA_U16 }, + [NDA_VNI] = { .type = NLA_U32 }, + [NDA_IFINDEX] = { .type = NLA_U32 }, + [NDA_MASTER] = { .type = NLA_U32 }, + [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, +}; + +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *dst_attr; + struct neigh_table *tbl; + struct neighbour *neigh; + struct net_device *dev = NULL; + int err = -EINVAL; + + ASSERT_RTNL(); + if (nlmsg_len(nlh) < sizeof(*ndm)) + goto out; + + dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + goto out; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + } + + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; + + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); + goto out; + } + + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } + + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; + goto out; + } + + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); + write_lock_bh(&tbl->lock); + neigh_release(neigh); + neigh_remove_one(neigh, tbl); + write_unlock_bh(&tbl->lock); + +out: + return err; +} + +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct neigh_table *tbl; + struct net_device *dev = NULL; + struct neighbour *neigh; + void *dst, *lladdr; + u8 protocol = 0; + u32 ndm_flags; + int err; + + ASSERT_RTNL(); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, + nda_policy, extack); + if (err < 0) + goto out; + + err = -EINVAL; + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + goto out; + } + + ndm = nlmsg_data(nlh); + ndm_flags = ndm->ndm_flags; + if (tb[NDA_FLAGS_EXT]) { + u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); + + BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < + (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + + hweight32(NTF_EXT_MASK))); + ndm_flags |= (ext << NTF_EXT_SHIFT); + } + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); + goto out; + } + } + + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; + + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); + goto out; + } + + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + + if (tb[NDA_PROTOCOL]) + protocol = nla_get_u8(tb[NDA_PROTOCOL]); + if (ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + if (ndm_flags & NTF_MANAGED) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); + goto out; + } + + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm_flags; + if (protocol) + pn->protocol = protocol; + err = 0; + } + goto out; + } + + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); + goto out; + } + + if (tbl->allow_add && !tbl->allow_add(dev, extack)) { + err = -EINVAL; + goto out; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; + bool exempt_from_gc = ndm_permanent || + ndm_flags & NTF_EXT_LEARNED; + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; + goto out; + } + if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); + err = -EINVAL; + goto out; + } + + neigh = ___neigh_create(tbl, dst, dev, + ndm_flags & + (NTF_EXT_LEARNED | NTF_MANAGED), + exempt_from_gc, true); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); + goto out; + } + + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~(NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + } + + if (protocol) + neigh->protocol = protocol; + if (ndm_flags & NTF_EXT_LEARNED) + flags |= NEIGH_UPDATE_F_EXT_LEARNED; + if (ndm_flags & NTF_ROUTER) + flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm_flags & NTF_MANAGED) + flags |= NEIGH_UPDATE_F_MANAGED; + if (ndm_flags & NTF_USE) + flags |= NEIGH_UPDATE_F_USE; + + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { + neigh_event_send(neigh, NULL); + err = 0; + } + neigh_release(neigh); +out: + return err; +} + +static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) +{ + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, NDTA_PARMS); + if (nest == NULL) + return -ENOBUFS; + + if ((parms->dev && + nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || + nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || + nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, + NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || + /* approximative value for deprecated QUEUE_LEN (in packets) */ + nla_put_u32(skb, NDTPA_QUEUE_LEN, + NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || + nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || + nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || + nla_put_u32(skb, NDTPA_UCAST_PROBES, + NEIGH_VAR(parms, UCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_PROBES, + NEIGH_VAR(parms, MCAST_PROBES)) || + nla_put_u32(skb, NDTPA_MCAST_REPROBES, + NEIGH_VAR(parms, MCAST_REPROBES)) || + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, + NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_GC_STALETIME, + NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, + NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_RETRANS_TIME, + NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, + NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_PROXY_DELAY, + NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_LOCKTIME, + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, + NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) + goto nla_put_failure; + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, + u32 pid, u32 seq, int type, int flags) +{ + struct nlmsghdr *nlh; + struct ndtmsg *ndtmsg; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndtmsg = nlmsg_data(nlh); + + read_lock_bh(&tbl->lock); + ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; + + if (nla_put_string(skb, NDTA_NAME, tbl->id) || + nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), + NDTA_PAD) || + nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || + nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || + nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) + goto nla_put_failure; + { + unsigned long now = jiffies; + long flush_delta = now - READ_ONCE(tbl->last_flush); + long rand_delta = now - READ_ONCE(tbl->last_rand); + struct neigh_hash_table *nht; + struct ndt_config ndc = { + .ndtc_key_len = tbl->key_len, + .ndtc_entry_size = tbl->entry_size, + .ndtc_entries = atomic_read(&tbl->entries), + .ndtc_last_flush = jiffies_to_msecs(flush_delta), + .ndtc_last_rand = jiffies_to_msecs(rand_delta), + .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), + }; + + rcu_read_lock(); + nht = rcu_dereference(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd[0]; + ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); + rcu_read_unlock(); + + if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) + goto nla_put_failure; + } + + { + int cpu; + struct ndt_stats ndst; + + memset(&ndst, 0, sizeof(ndst)); + + for_each_possible_cpu(cpu) { + struct neigh_statistics *st; + + st = per_cpu_ptr(tbl->stats, cpu); + ndst.ndts_allocs += READ_ONCE(st->allocs); + ndst.ndts_destroys += READ_ONCE(st->destroys); + ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); + ndst.ndts_res_failed += READ_ONCE(st->res_failed); + ndst.ndts_lookups += READ_ONCE(st->lookups); + ndst.ndts_hits += READ_ONCE(st->hits); + ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); + ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); + ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); + ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); + ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); + } + + if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, + NDTA_PAD)) + goto nla_put_failure; + } + + BUG_ON(tbl->parms.dev); + if (neightbl_fill_parms(skb, &tbl->parms) < 0) + goto nla_put_failure; + + read_unlock_bh(&tbl->lock); + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + read_unlock_bh(&tbl->lock); + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int neightbl_fill_param_info(struct sk_buff *skb, + struct neigh_table *tbl, + struct neigh_parms *parms, + u32 pid, u32 seq, int type, + unsigned int flags) +{ + struct ndtmsg *ndtmsg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndtmsg = nlmsg_data(nlh); + + read_lock_bh(&tbl->lock); + ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; + + if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || + neightbl_fill_parms(skb, parms) < 0) + goto errout; + + read_unlock_bh(&tbl->lock); + nlmsg_end(skb, nlh); + return 0; +errout: + read_unlock_bh(&tbl->lock); + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { + [NDTA_NAME] = { .type = NLA_STRING }, + [NDTA_THRESH1] = { .type = NLA_U32 }, + [NDTA_THRESH2] = { .type = NLA_U32 }, + [NDTA_THRESH3] = { .type = NLA_U32 }, + [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, + [NDTA_PARMS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { + [NDTPA_IFINDEX] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, + [NDTPA_APP_PROBES] = { .type = NLA_U32 }, + [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, + [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, + [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, + [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, + [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, + [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, + [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, + [NDTPA_LOCKTIME] = { .type = NLA_U64 }, + [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, +}; + +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct neigh_table *tbl; + struct ndtmsg *ndtmsg; + struct nlattr *tb[NDTA_MAX+1]; + bool found = false; + int err, tidx; + + err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, + nl_neightbl_policy, extack); + if (err < 0) + goto errout; + + if (tb[NDTA_NAME] == NULL) { + err = -EINVAL; + goto errout; + } + + ndtmsg = nlmsg_data(nlh); + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + tbl = neigh_tables[tidx]; + if (!tbl) + continue; + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) + continue; + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + /* + * We acquire tbl->lock to be nice to the periodic timers and + * make sure they always see a consistent set of values. + */ + write_lock_bh(&tbl->lock); + + if (tb[NDTA_PARMS]) { + struct nlattr *tbp[NDTPA_MAX+1]; + struct neigh_parms *p; + int i, ifindex = 0; + + err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, + tb[NDTA_PARMS], + nl_ntbl_parm_policy, extack); + if (err < 0) + goto errout_tbl_lock; + + if (tbp[NDTPA_IFINDEX]) + ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); + + p = lookup_neigh_parms(tbl, net, ifindex); + if (p == NULL) { + err = -ENOENT; + goto errout_tbl_lock; + } + + for (i = 1; i <= NDTPA_MAX; i++) { + if (tbp[i] == NULL) + continue; + + switch (i) { + case NDTPA_QUEUE_LEN: + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN)); + break; + case NDTPA_QUEUE_LENBYTES: + NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, + nla_get_u32(tbp[i])); + break; + case NDTPA_PROXY_QLEN: + NEIGH_VAR_SET(p, PROXY_QLEN, + nla_get_u32(tbp[i])); + break; + case NDTPA_APP_PROBES: + NEIGH_VAR_SET(p, APP_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_UCAST_PROBES: + NEIGH_VAR_SET(p, UCAST_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_MCAST_PROBES: + NEIGH_VAR_SET(p, MCAST_PROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_MCAST_REPROBES: + NEIGH_VAR_SET(p, MCAST_REPROBES, + nla_get_u32(tbp[i])); + break; + case NDTPA_BASE_REACHABLE_TIME: + NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, + nla_get_msecs(tbp[i])); + /* update reachable_time as well, otherwise, the change will + * only be effective after the next time neigh_periodic_work + * decides to recompute it (can be multiple minutes) + */ + p->reachable_time = + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + break; + case NDTPA_GC_STALETIME: + NEIGH_VAR_SET(p, GC_STALETIME, + nla_get_msecs(tbp[i])); + break; + case NDTPA_DELAY_PROBE_TIME: + NEIGH_VAR_SET(p, DELAY_PROBE_TIME, + nla_get_msecs(tbp[i])); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + break; + case NDTPA_INTERVAL_PROBE_TIME_MS: + NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, + nla_get_msecs(tbp[i])); + break; + case NDTPA_RETRANS_TIME: + NEIGH_VAR_SET(p, RETRANS_TIME, + nla_get_msecs(tbp[i])); + break; + case NDTPA_ANYCAST_DELAY: + NEIGH_VAR_SET(p, ANYCAST_DELAY, + nla_get_msecs(tbp[i])); + break; + case NDTPA_PROXY_DELAY: + NEIGH_VAR_SET(p, PROXY_DELAY, + nla_get_msecs(tbp[i])); + break; + case NDTPA_LOCKTIME: + NEIGH_VAR_SET(p, LOCKTIME, + nla_get_msecs(tbp[i])); + break; + } + } + } + + err = -ENOENT; + if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || + tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && + !net_eq(net, &init_net)) + goto errout_tbl_lock; + + if (tb[NDTA_THRESH1]) + WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); + + if (tb[NDTA_THRESH2]) + WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); + + if (tb[NDTA_THRESH3]) + WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); + + if (tb[NDTA_GC_INTERVAL]) + WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); + + err = 0; + +errout_tbl_lock: + write_unlock_bh(&tbl->lock); +errout: + return err; +} + +static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ndtmsg *ndtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); + return -EINVAL; + } + + ndtm = nlmsg_data(nlh); + if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); + return -EINVAL; + } + + return 0; +} + +static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct net *net = sock_net(skb->sk); + int family, tidx, nidx = 0; + int tbl_skip = cb->args[0]; + int neigh_skip = cb->args[1]; + struct neigh_table *tbl; + + if (cb->strict_check) { + int err = neightbl_valid_dump_info(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + struct neigh_parms *p; + + tbl = neigh_tables[tidx]; + if (!tbl) + continue; + + if (tidx < tbl_skip || (family && tbl->family != family)) + continue; + + if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + NLM_F_MULTI) < 0) + break; + + nidx = 0; + p = list_next_entry(&tbl->parms, list); + list_for_each_entry_from(p, &tbl->parms_list, list) { + if (!net_eq(neigh_parms_net(p), net)) + continue; + + if (nidx < neigh_skip) + goto next; + + if (neightbl_fill_param_info(skb, tbl, p, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNEIGHTBL, + NLM_F_MULTI) < 0) + goto out; + next: + nidx++; + } + + neigh_skip = 0; + } +out: + cb->args[0] = tidx; + cb->args[1] = nidx; + + return skb->len; +} + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) +{ + u32 neigh_flags, neigh_flags_ext; + unsigned long now = jiffies; + struct nda_cacheinfo ci; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; + neigh_flags = neigh->flags & NTF_OLD_MASK; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = neigh->ops->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = neigh_flags; + ndm->ndm_type = neigh->type; + ndm->ndm_ifindex = neigh->dev->ifindex; + + if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) + goto nla_put_failure; + + read_lock_bh(&neigh->lock); + ndm->ndm_state = neigh->nud_state; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } + } + + ci.ndm_used = jiffies_to_clock_t(now - neigh->used); + ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); + ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); + ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; + read_unlock_bh(&neigh->lock); + + if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || + nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; + + if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) + goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, + u32 pid, u32 seq, int type, unsigned int flags, + struct neigh_table *tbl) +{ + u32 neigh_flags, neigh_flags_ext; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; + neigh_flags = pn->flags & NTF_OLD_MASK; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = tbl->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = neigh_flags | NTF_PROXY; + ndm->ndm_type = RTN_UNICAST; + ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; + ndm->ndm_state = NUD_NONE; + + if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) + goto nla_put_failure; + + if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) +{ + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); +} + +static bool neigh_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = dev ? netdev_master_upper_dev_get(dev) : NULL; + + /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another + * invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) +{ + if (filter_idx && (!dev || dev->ifindex != filter_idx)) + return true; + + return false; +} + +struct neigh_dump_filter { + int master_idx; + int dev_idx; +}; + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb, + struct neigh_dump_filter *filter) +{ + struct net *net = sock_net(skb->sk); + struct neighbour *n; + int rc, h, s_h = cb->args[1]; + int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; + unsigned int flags = NLM_F_MULTI; + + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; + + rcu_read_lock(); + nht = rcu_dereference(tbl->nht); + + for (h = s_h; h < (1 << nht->hash_shift); h++) { + if (h > s_h) + s_idx = 0; + for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference(n->next)) { + if (idx < s_idx || !net_eq(dev_net(n->dev), net)) + goto next; + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) + goto next; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + flags) < 0) { + rc = -1; + goto out; + } +next: + idx++; + } + } + rc = skb->len; +out: + rcu_read_unlock(); + cb->args[1] = h; + cb->args[2] = idx; + return rc; +} + +static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb, + struct neigh_dump_filter *filter) +{ + struct pneigh_entry *n; + struct net *net = sock_net(skb->sk); + int rc, h, s_h = cb->args[3]; + int idx, s_idx = idx = cb->args[4]; + unsigned int flags = NLM_F_MULTI; + + if (filter->dev_idx || filter->master_idx) + flags |= NLM_F_DUMP_FILTERED; + + read_lock_bh(&tbl->lock); + + for (h = s_h; h <= PNEIGH_HASHMASK; h++) { + if (h > s_h) + s_idx = 0; + for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + if (idx < s_idx || pneigh_net(n) != net) + goto next; + if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || + neigh_master_filtered(n->dev, filter->master_idx)) + goto next; + if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl) < 0) { + read_unlock_bh(&tbl->lock); + rc = -1; + goto out; + } + next: + idx++; + } + } + + read_unlock_bh(&tbl->lock); + rc = skb->len; +out: + cb->args[3] = h; + cb->args[4] = idx; + return rc; + +} + +static int neigh_valid_dump_req(const struct nlmsghdr *nlh, + bool strict_check, + struct neigh_dump_filter *filter, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + int err, i; + + if (strict_check) { + struct ndmsg *ndm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || + ndm->ndm_state || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), + tb, NDA_MAX, nda_policy, + extack); + } else { + err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); + } + if (err < 0) + return err; + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + /* all new attributes should require strict_check */ + switch (i) { + case NDA_IFINDEX: + filter->dev_idx = nla_get_u32(tb[i]); + break; + case NDA_MASTER: + filter->master_idx = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); + return -EINVAL; + } + } + } + + return 0; +} + +static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct neigh_dump_filter filter = {}; + struct neigh_table *tbl; + int t, family, s_t; + int proxy = 0; + int err; + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + + /* check for full ndmsg structure presence, family member is + * the same for both structures + */ + if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && + ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) + proxy = 1; + + err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); + if (err < 0 && cb->strict_check) + return err; + + s_t = cb->args[0]; + + for (t = 0; t < NEIGH_NR_TABLES; t++) { + tbl = neigh_tables[t]; + + if (!tbl) + continue; + if (t < s_t || (family && tbl->family != family)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args) - + sizeof(cb->args[0])); + if (proxy) + err = pneigh_dump_table(tbl, skb, cb, &filter); + else + err = neigh_dump_table(tbl, skb, cb, &filter); + if (err < 0) + break; + } + + cb->args[0] = t; + return skb->len; +} + +static int neigh_valid_get_req(const struct nlmsghdr *nlh, + struct neigh_table **tbl, + void **dst, int *dev_idx, u8 *ndm_flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~NTF_PROXY) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *dev_idx = ndm->ndm_ifindex; + *tbl = neigh_find_table(ndm->ndm_family); + if (*tbl == NULL) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + return -EAFNOSUPPORT; + } + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_DST: + if (nla_len(tb[i]) != (int)(*tbl)->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + return -EINVAL; + } + *dst = nla_data(tb[i]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); + return -EINVAL; + } + } + + return 0; +} + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int neigh_get_reply(struct net *net, struct neighbour *neigh, + u32 pid, u32 seq) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static inline size_t pneigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + + nla_total_size(1); /* NDA_PROTOCOL */ +} + +static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, + u32 pid, u32 seq, struct neigh_table *tbl) +{ + struct sk_buff *skb; + int err = 0; + + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, pid); +errout: + return err; +} + +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct net_device *dev = NULL; + struct neigh_table *tbl = NULL; + struct neighbour *neigh; + void *dst = NULL; + u8 ndm_flags = 0; + int dev_idx = 0; + int err; + + err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, + extack); + if (err < 0) + return err; + + if (dev_idx) { + dev = __dev_get_by_index(net, dev_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (!dst) { + NL_SET_ERR_MSG(extack, "Network address not specified"); + return -EINVAL; + } + + if (ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + pn = pneigh_lookup(tbl, net, dst, dev, 0); + if (!pn) { + NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); + return -ENOENT; + } + return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, tbl); + } + + if (!dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -EINVAL; + } + + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + return -ENOENT; + } + + err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq); + + neigh_release(neigh); + + return err; +} + +void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) +{ + int chain; + struct neigh_hash_table *nht; + + rcu_read_lock(); + nht = rcu_dereference(tbl->nht); + + read_lock_bh(&tbl->lock); /* avoid resizes */ + for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct neighbour *n; + + for (n = rcu_dereference(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference(n->next)) + cb(n, cookie); + } + read_unlock_bh(&tbl->lock); + rcu_read_unlock(); +} +EXPORT_SYMBOL(neigh_for_each); + +/* The tbl->lock must be held as a writer and BH disabled. */ +void __neigh_for_each_release(struct neigh_table *tbl, + int (*cb)(struct neighbour *)) +{ + int chain; + struct neigh_hash_table *nht; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct neighbour *n; + struct neighbour __rcu **np; + + np = &nht->hash_buckets[chain]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + int release; + + write_lock(&n->lock); + release = cb(n); + if (release) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + neigh_mark_dead(n); + } else + np = &n->next; + write_unlock(&n->lock); + if (release) + neigh_cleanup_and_release(n); + } + } +} +EXPORT_SYMBOL(__neigh_for_each_release); + +int neigh_xmit(int index, struct net_device *dev, + const void *addr, struct sk_buff *skb) +{ + int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { + struct neigh_table *tbl; + struct neighbour *neigh; + + tbl = neigh_tables[index]; + if (!tbl) + goto out; + rcu_read_lock(); + if (index == NEIGH_ARP_TABLE) { + u32 key = *((u32 *)addr); + + neigh = __ipv4_neigh_lookup_noref(dev, key); + } else { + neigh = __neigh_lookup_noref(tbl, addr, dev); + } + if (!neigh) + neigh = __neigh_create(tbl, addr, dev, false); + err = PTR_ERR(neigh); + if (IS_ERR(neigh)) { + rcu_read_unlock(); + goto out_kfree_skb; + } + err = READ_ONCE(neigh->output)(neigh, skb); + rcu_read_unlock(); + } + else if (index == NEIGH_LINK_TABLE) { + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + addr, NULL, skb->len); + if (err < 0) + goto out_kfree_skb; + err = dev_queue_xmit(skb); + } +out: + return err; +out_kfree_skb: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(neigh_xmit); + +#ifdef CONFIG_PROC_FS + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; + struct neighbour *n = NULL; + int bucket; + + state->flags &= ~NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { + n = rcu_dereference(nht->hash_buckets[bucket]); + + while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, &fakep); + if (!v) + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + break; +next: + n = rcu_dereference(n->next); + } + + if (n) + break; + } + state->bucket = bucket; + + return n; +} + +static struct neighbour *neigh_get_next(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; + + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + } + n = rcu_dereference(n->next); + + while (1) { + while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + break; +next: + n = rcu_dereference(n->next); + } + + if (n) + break; + + if (++state->bucket >= (1 << nht->hash_shift)) + break; + + n = rcu_dereference(nht->hash_buckets[state->bucket]); + } + + if (n && pos) + --(*pos); + return n; +} + +static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct neighbour *n = neigh_get_first(seq); + + if (n) { + --(*pos); + while (*pos) { + n = neigh_get_next(seq, n, pos); + if (!n) + break; + } + } + return *pos ? NULL : n; +} + +static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_table *tbl = state->tbl; + struct pneigh_entry *pn = NULL; + int bucket; + + state->flags |= NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { + pn = tbl->phash_buckets[bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; + if (pn) + break; + } + state->bucket = bucket; + + return pn; +} + +static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, + struct pneigh_entry *pn, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_table *tbl = state->tbl; + + do { + pn = pn->next; + } while (pn && !net_eq(pneigh_net(pn), net)); + + while (!pn) { + if (++state->bucket > PNEIGH_HASHMASK) + break; + pn = tbl->phash_buckets[state->bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; + if (pn) + break; + } + + if (pn && pos) + --(*pos); + + return pn; +} + +static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct pneigh_entry *pn = pneigh_get_first(seq); + + if (pn) { + --(*pos); + while (*pos) { + pn = pneigh_get_next(seq, pn, pos); + if (!pn) + break; + } + } + return *pos ? NULL : pn; +} + +static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + void *rc; + loff_t idxpos = *pos; + + rc = neigh_get_idx(seq, &idxpos); + if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_idx(seq, &idxpos); + + return rc; +} + +void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) + __acquires(tbl->lock) + __acquires(rcu) +{ + struct neigh_seq_state *state = seq->private; + + state->tbl = tbl; + state->bucket = 0; + state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); + + rcu_read_lock(); + state->nht = rcu_dereference(tbl->nht); + read_lock_bh(&tbl->lock); + + return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL(neigh_seq_start); + +void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_seq_state *state; + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = neigh_get_first(seq); + goto out; + } + + state = seq->private; + if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { + rc = neigh_get_next(seq, v, NULL); + if (rc) + goto out; + if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_first(seq); + } else { + BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); + rc = pneigh_get_next(seq, v, NULL); + } +out: + ++(*pos); + return rc; +} +EXPORT_SYMBOL(neigh_seq_next); + +void neigh_seq_stop(struct seq_file *seq, void *v) + __releases(tbl->lock) + __releases(rcu) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + read_unlock_bh(&tbl->lock); + rcu_read_unlock(); +} +EXPORT_SYMBOL(neigh_seq_stop); + +/* statistics via seq_file */ + +static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct neigh_table *tbl = pde_data(file_inode(seq->file)); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_table *tbl = pde_data(file_inode(seq->file)); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + (*pos)++; + return NULL; +} + +static void neigh_stat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int neigh_stat_seq_show(struct seq_file *seq, void *v) +{ + struct neigh_table *tbl = pde_data(file_inode(seq->file)); + struct neigh_statistics *st = v; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); + return 0; + } + + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx " + "%08lx %08lx %08lx\n", + atomic_read(&tbl->entries), + + st->allocs, + st->destroys, + st->hash_grows, + + st->lookups, + st->hits, + + st->res_failed, + + st->rcv_probes_mcast, + st->rcv_probes_ucast, + + st->periodic_gc_runs, + st->forced_gc_runs, + st->unres_discards, + st->table_fulls + ); + + return 0; +} + +static const struct seq_operations neigh_stat_seq_ops = { + .start = neigh_stat_seq_start, + .next = neigh_stat_seq_next, + .stop = neigh_stat_seq_stop, + .show = neigh_stat_seq_show, +}; +#endif /* CONFIG_PROC_FS */ + +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid) +{ + struct net *net = dev_net(n->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = neigh_fill_info(skb, n, pid, 0, type, flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +void neigh_app_ns(struct neighbour *n) +{ + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); +} +EXPORT_SYMBOL(neigh_app_ns); + +#ifdef CONFIG_SYSCTL +static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); + +static int proc_unres_qlen(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int size, ret; + struct ctl_table tmp = *ctl; + + tmp.extra1 = SYSCTL_ZERO; + tmp.extra2 = &unres_qlen_max; + tmp.data = &size; + + size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && !ret) + *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); + return ret; +} + +static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, + int index) +{ + struct net_device *dev; + int family = neigh_parms_family(p); + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct neigh_parms *dst_p = + neigh_get_dev_parms_rcu(dev, family); + + if (dst_p && !test_bit(index, dst_p->data_state)) + dst_p->data[index] = p->data[index]; + } + rcu_read_unlock(); +} + +static void neigh_proc_update(struct ctl_table *ctl, int write) +{ + struct net_device *dev = ctl->extra1; + struct neigh_parms *p = ctl->extra2; + struct net *net = neigh_parms_net(p); + int index = (int *) ctl->data - p->data; + + if (!write) + return; + + set_bit(index, p->data_state); + if (index == NEIGH_VAR_DELAY_PROBE_TIME) + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + if (!dev) /* NULL dev means this is default value */ + neigh_copy_dflt_parms(net, p, index); +} + +static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + tmp.extra1 = SYSCTL_ZERO; + tmp.extra2 = SYSCTL_INT_MAX; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + +static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + int min = msecs_to_jiffies(1); + + tmp.extra1 = &min; + tmp.extra2 = NULL; + + ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec); + +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); + +static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} +EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); + +static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); + + neigh_proc_update(ctl, write); + return ret; +} + +static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct neigh_parms *p = ctl->extra2; + int ret; + + if (strcmp(ctl->procname, "base_reachable_time") == 0) + ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); + else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) + ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); + else + ret = -1; + + if (write && ret == 0) { + /* update reachable_time as well, otherwise, the change will + * only be effective after the next time neigh_periodic_work + * decides to recompute it + */ + p->reachable_time = + neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + } + return ret; +} + +#define NEIGH_PARMS_DATA_OFFSET(index) \ + (&((struct neigh_parms *) 0)->data[index]) + +#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ + [NEIGH_VAR_ ## attr] = { \ + .procname = name, \ + .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ + .maxlen = sizeof(int), \ + .mode = mval, \ + .proc_handler = proc, \ + } + +#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) + +#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) + +#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) + +#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) + +#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) + +#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) + +static struct neigh_sysctl_table { + struct ctl_table_header *sysctl_header; + struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; +} neigh_sysctl_template __read_mostly = { + .neigh_vars = { + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), + NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, + "interval_probe_time_ms"), + NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), + NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), + NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), + NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), + NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), + [NEIGH_VAR_GC_INTERVAL] = { + .procname = "gc_interval", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NEIGH_VAR_GC_THRESH1] = { + .procname = "gc_thresh1", + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + .proc_handler = proc_dointvec_minmax, + }, + [NEIGH_VAR_GC_THRESH2] = { + .procname = "gc_thresh2", + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + .proc_handler = proc_dointvec_minmax, + }, + [NEIGH_VAR_GC_THRESH3] = { + .procname = "gc_thresh3", + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + .proc_handler = proc_dointvec_minmax, + }, + {}, + }, +}; + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, + proc_handler *handler) +{ + int i; + struct neigh_sysctl_table *t; + const char *dev_name_source; + char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; + char *p_name; + + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); + if (!t) + goto err; + + for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { + t->neigh_vars[i].data += (long) p; + t->neigh_vars[i].extra1 = dev; + t->neigh_vars[i].extra2 = p; + } + + if (dev) { + dev_name_source = dev->name; + /* Terminate the table early */ + memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, + sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); + } else { + struct neigh_table *tbl = p->tbl; + dev_name_source = "default"; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; + } + + if (handler) { + /* RetransTime */ + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; + /* ReachableTime */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; + /* RetransTime (in milliseconds)*/ + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; + } else { + /* Those handlers will update p->reachable_time after + * base_reachable_time(_ms) is set to ensure the new timer starts being + * applied after the next neighbour update instead of waiting for + * neigh_periodic_work to update its value (can be multiple minutes) + * So any handler that replaces them should do this as well + */ + /* ReachableTime */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = + neigh_proc_base_reachable_time; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = + neigh_proc_base_reachable_time; + } + + switch (neigh_parms_family(p)) { + case AF_INET: + p_name = "ipv4"; + break; + case AF_INET6: + p_name = "ipv6"; + break; + default: + BUG(); + } + + snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", + p_name, dev_name_source); + t->sysctl_header = + register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars); + if (!t->sysctl_header) + goto free; + + p->sysctl_table = t; + return 0; + +free: + kfree(t); +err: + return -ENOBUFS; +} +EXPORT_SYMBOL(neigh_sysctl_register); + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_net_sysctl_table(t->sysctl_header); + kfree(t); + } +} +EXPORT_SYMBOL(neigh_sysctl_unregister); + +#endif /* CONFIG_SYSCTL */ + +static int __init neigh_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); + + rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, + 0); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); + + return 0; +} + +subsys_initcall(neigh_init); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c new file mode 100644 index 000000000..1ec23bf8b --- /dev/null +++ b/net/core/net-procfs.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <net/wext.h> + +#include "dev.h" + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_head *h; + unsigned int count = 0, offset = get_offset(*pos); + + h = &net->dev_index_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, index_hlist) { + if (++count == offset) + return dev; + } + + return NULL; +} + +static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + unsigned int bucket; + + do { + dev = dev_from_same_bucket(seq, pos); + if (dev) + return dev; + + bucket = get_bucket(*pos) + 1; + *pos = set_bucket_offset(bucket, 1); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ +static void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + if (!*pos) + return SEQ_START_TOKEN; + + if (get_bucket(*pos) >= NETDEV_HASHENTRIES) + return NULL; + + return dev_from_bucket(seq, pos); +} + +static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return dev_from_bucket(seq, pos); +} + +static void dev_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static u32 softnet_backlog_len(struct softnet_data *sd) +{ + return skb_queue_len_lockless(&sd->input_pkt_queue) + + skb_queue_len_lockless(&sd->process_queue); +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ + struct softnet_data *sd = NULL; + + while (*pos < nr_cpu_ids) + if (cpu_online(*pos)) { + sd = &per_cpu(softnet_data, *pos); + break; + } else + ++*pos; + return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct softnet_data *sd = v; + unsigned int flow_limit_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) + flow_limit_count = fl->count; + rcu_read_unlock(); +#endif + + /* the index is the CPU id owing this sd. Since offline CPUs are not + * displayed, it would be othrwise not trivial for the user-space + * mapping the data a specific CPU + */ + seq_printf(seq, + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + sd->processed, sd->dropped, sd->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + 0, /* was cpu_collision */ + sd->received_rps, flow_limit_count, + softnet_backlog_len(sd), (int)seq->index); + return 0; +} + +static const struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static const struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *ptype_list = NULL; + struct packet_type *pt = NULL; + struct net_device *dev; + loff_t i = 0; + int t; + + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < PTYPE_HASH_SIZE; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev; + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(seq, 0); + + pt = v; + nxt = pt->list.next; + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + + nxt = ptype_all.next; + goto ptype_all; + } + + if (pt->type == htons(ETH_P_ALL)) { +ptype_all: + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + + while (nxt == &ptype_base[hash]) { + if (++hash >= PTYPE_HASH_SIZE) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s %ps\n", + pt->dev ? pt->dev->name : "", pt->func); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int __net_init dev_proc_net_init(struct net *net) +{ + int rc = -ENOMEM; + + if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops, + sizeof(struct seq_net_private))) + goto out; + if (!proc_create_seq("softnet_stat", 0444, net->proc_net, + &softnet_seq_ops)) + goto out_dev; + if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, + sizeof(struct seq_net_private))) + goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; + rc = 0; +out: + return rc; +out_ptype: + remove_proc_entry("ptype", net->proc_net); +out_softnet: + remove_proc_entry("softnet_stat", net->proc_net); +out_dev: + remove_proc_entry("dev", net->proc_net); + goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + remove_proc_entry("ptype", net->proc_net); + remove_proc_entry("softnet_stat", net->proc_net); + remove_proc_entry("dev", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", + dev->ifindex, dev->name, + ha->refcount, ha->global_use, + (int)dev->addr_len, ha->addr); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + remove_proc_entry("dev_mcast", net->proc_net); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +int __init dev_proc_init(void) +{ + int ret = register_pernet_subsys(&dev_proc_ops); + if (!ret) + return register_pernet_subsys(&dev_mc_net_ops); + return ret; +} diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c new file mode 100644 index 000000000..8409d4140 --- /dev/null +++ b/net/core/net-sysfs.c @@ -0,0 +1,2079 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net-sysfs.c - network device class and attributes + * + * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> + */ + +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/slab.h> +#include <linux/sched/signal.h> +#include <linux/sched/isolation.h> +#include <linux/nsproxy.h> +#include <net/sock.h> +#include <net/net_namespace.h> +#include <linux/rtnetlink.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/jiffies.h> +#include <linux/pm_runtime.h> +#include <linux/of.h> +#include <linux/of_net.h> +#include <linux/cpu.h> + +#include "dev.h" +#include "net-sysfs.h" + +#ifdef CONFIG_SYSFS +static const char fmt_hex[] = "%#x\n"; +static const char fmt_dec[] = "%d\n"; +static const char fmt_ulong[] = "%lu\n"; +static const char fmt_u64[] = "%llu\n"; + +/* Caller holds RTNL or dev_base_lock */ +static inline int dev_isalive(const struct net_device *dev) +{ + return dev->reg_state <= NETREG_REGISTERED; +} + +/* use same locking rules as GIF* ioctl's */ +static ssize_t netdev_show(const struct device *dev, + struct device_attribute *attr, char *buf, + ssize_t (*format)(const struct net_device *, char *)) +{ + struct net_device *ndev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(ndev)) + ret = (*format)(ndev, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* generate a show function for simple field */ +#define NETDEVICE_SHOW(field, format_string) \ +static ssize_t format_##field(const struct net_device *dev, char *buf) \ +{ \ + return sysfs_emit(buf, format_string, dev->field); \ +} \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return netdev_show(dev, attr, buf, format_##field); \ +} \ + +#define NETDEVICE_SHOW_RO(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RO(field) + +#define NETDEVICE_SHOW_RW(field, format_string) \ +NETDEVICE_SHOW(field, format_string); \ +static DEVICE_ATTR_RW(field) + +/* use same locking and permission rules as SIF* ioctl's */ +static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); + unsigned long new; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtoul(buf, 0, &new); + if (ret) + goto err; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; + } + rtnl_unlock(); + err: + return ret; +} + +NETDEVICE_SHOW_RO(dev_id, fmt_hex); +NETDEVICE_SHOW_RO(dev_port, fmt_dec); +NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); +NETDEVICE_SHOW_RO(addr_len, fmt_dec); +NETDEVICE_SHOW_RO(ifindex, fmt_dec); +NETDEVICE_SHOW_RO(type, fmt_dec); +NETDEVICE_SHOW_RO(link_mode, fmt_dec); + +static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); +} +static DEVICE_ATTR_RO(iflink); + +static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) +{ + return sysfs_emit(buf, fmt_dec, dev->name_assign_type); +} + +static ssize_t name_assign_type_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (ndev->name_assign_type != NET_NAME_UNKNOWN) + ret = netdev_show(dev, attr, buf, format_name_assign_type); + + return ret; +} +static DEVICE_ATTR_RO(name_assign_type); + +/* use same locking rules as GIFHWADDR ioctl's */ +static ssize_t address_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(ndev)) + ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); + read_unlock(&dev_base_lock); + return ret; +} +static DEVICE_ATTR_RO(address); + +static ssize_t broadcast_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + if (dev_isalive(ndev)) + return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + return -EINVAL; +} +static DEVICE_ATTR_RO(broadcast); + +static int change_carrier(struct net_device *dev, unsigned long new_carrier) +{ + if (!netif_running(dev)) + return -EINVAL; + return dev_change_carrier(dev, (bool)new_carrier); +} + +static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_carrier; this helps returning early + * without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_carrier) + return -EOPNOTSUPP; + + return netdev_store(dev, attr, buf, len, change_carrier); +} + +static ssize_t carrier_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RW(carrier); + +static ssize_t speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && netif_device_present(netdev)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) + ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); + } + rtnl_unlock(); + return ret; +} +static DEVICE_ATTR_RO(speed); + +static ssize_t duplex_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev)) { + struct ethtool_link_ksettings cmd; + + if (!__ethtool_get_link_ksettings(netdev, &cmd)) { + const char *duplex; + + switch (cmd.base.duplex) { + case DUPLEX_HALF: + duplex = "half"; + break; + case DUPLEX_FULL: + duplex = "full"; + break; + default: + duplex = "unknown"; + break; + } + ret = sysfs_emit(buf, "%s\n", duplex); + } + } + rtnl_unlock(); + return ret; +} +static DEVICE_ATTR_RO(duplex); + +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + +static ssize_t dormant_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(dormant); + +static const char *const operstates[] = { + "unknown", + "notpresent", /* currently unused */ + "down", + "lowerlayerdown", + "testing", + "dormant", + "up" +}; + +static ssize_t operstate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + unsigned char operstate; + + read_lock(&dev_base_lock); + operstate = netdev->operstate; + if (!netif_running(netdev)) + operstate = IF_OPER_DOWN; + read_unlock(&dev_base_lock); + + if (operstate >= ARRAY_SIZE(operstates)) + return -EINVAL; /* should not happen */ + + return sysfs_emit(buf, "%s\n", operstates[operstate]); +} +static DEVICE_ATTR_RO(operstate); + +static ssize_t carrier_changes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_changes); + +static ssize_t carrier_up_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); +} +static DEVICE_ATTR_RO(carrier_up_count); + +static ssize_t carrier_down_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_down_count); + +/* read-write attributes */ + +static int change_mtu(struct net_device *dev, unsigned long new_mtu) +{ + return dev_set_mtu(dev, (int)new_mtu); +} + +static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_mtu); +} +NETDEVICE_SHOW_RW(mtu, fmt_dec); + +static int change_flags(struct net_device *dev, unsigned long new_flags) +{ + return dev_change_flags(dev, (unsigned int)new_flags, NULL); +} + +static ssize_t flags_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_flags); +} +NETDEVICE_SHOW_RW(flags, fmt_hex); + +static ssize_t tx_queue_len_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); +} +NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); + +static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) +{ + WRITE_ONCE(dev->gro_flush_timeout, val); + return 0; +} + +static ssize_t gro_flush_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); +} +NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); + +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + WRITE_ONCE(dev->napi_defer_hard_irqs, val); + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + +static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); + size_t count = len; + ssize_t ret = 0; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + /* ignore trailing newline */ + if (len > 0 && buf[len - 1] == '\n') + --count; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); + } +err: + rtnl_unlock(); + + return ret; +} + +static ssize_t ifalias_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; + ssize_t ret = 0; + + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sysfs_emit(buf, "%s\n", tmp); + return ret; +} +static DEVICE_ATTR_RW(ifalias); + +static int change_group(struct net_device *dev, unsigned long new_group) +{ + dev_set_group(dev, (int)new_group); + return 0; +} + +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_group); +} +NETDEVICE_SHOW(group, fmt_dec); +static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); + +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool)proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + +static ssize_t phys_port_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + /* The check is also done in dev_get_phys_port_id; this helps returning + * early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_item_id ppid; + + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_id); + +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_port_name); + +static ssize_t phys_switch_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. This works + * because recurse is false when calling dev_get_port_parent_id. + */ + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_item_id ppid = { }; + + ret = dev_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_switch_id); + +static ssize_t threaded_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) + ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + + rtnl_unlock(); + return ret; +} + +static int modify_napi_threaded(struct net_device *dev, unsigned long val) +{ + int ret; + + if (list_empty(&dev->napi_list)) + return -EOPNOTSUPP; + + if (val != 0 && val != 1) + return -EOPNOTSUPP; + + ret = dev_set_threaded(dev, val); + + return ret; +} + +static ssize_t threaded_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, modify_napi_threaded); +} +static DEVICE_ATTR_RW(threaded); + +static struct attribute *net_class_attrs[] __ro_after_init = { + &dev_attr_netdev_group.attr, + &dev_attr_type.attr, + &dev_attr_dev_id.attr, + &dev_attr_dev_port.attr, + &dev_attr_iflink.attr, + &dev_attr_ifindex.attr, + &dev_attr_name_assign_type.attr, + &dev_attr_addr_assign_type.attr, + &dev_attr_addr_len.attr, + &dev_attr_link_mode.attr, + &dev_attr_address.attr, + &dev_attr_broadcast.attr, + &dev_attr_speed.attr, + &dev_attr_duplex.attr, + &dev_attr_dormant.attr, + &dev_attr_testing.attr, + &dev_attr_operstate.attr, + &dev_attr_carrier_changes.attr, + &dev_attr_ifalias.attr, + &dev_attr_carrier.attr, + &dev_attr_mtu.attr, + &dev_attr_flags.attr, + &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, + &dev_attr_carrier_up_count.attr, + &dev_attr_carrier_down_count.attr, + &dev_attr_threaded.attr, + NULL, +}; +ATTRIBUTE_GROUPS(net_class); + +/* Show a given an attribute in the statistics group */ +static ssize_t netstat_show(const struct device *d, + struct device_attribute *attr, char *buf, + unsigned long offset) +{ + struct net_device *dev = to_net_dev(d); + ssize_t ret = -EINVAL; + + WARN_ON(offset > sizeof(struct rtnl_link_stats64) || + offset % sizeof(u64) != 0); + + read_lock(&dev_base_lock); + if (dev_isalive(dev)) { + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); + } + read_unlock(&dev_base_lock); + return ret; +} + +/* generate a read-only statistics attribute */ +#define NETSTAT_ENTRY(name) \ +static ssize_t name##_show(struct device *d, \ + struct device_attribute *attr, char *buf) \ +{ \ + return netstat_show(d, attr, buf, \ + offsetof(struct rtnl_link_stats64, name)); \ +} \ +static DEVICE_ATTR_RO(name) + +NETSTAT_ENTRY(rx_packets); +NETSTAT_ENTRY(tx_packets); +NETSTAT_ENTRY(rx_bytes); +NETSTAT_ENTRY(tx_bytes); +NETSTAT_ENTRY(rx_errors); +NETSTAT_ENTRY(tx_errors); +NETSTAT_ENTRY(rx_dropped); +NETSTAT_ENTRY(tx_dropped); +NETSTAT_ENTRY(multicast); +NETSTAT_ENTRY(collisions); +NETSTAT_ENTRY(rx_length_errors); +NETSTAT_ENTRY(rx_over_errors); +NETSTAT_ENTRY(rx_crc_errors); +NETSTAT_ENTRY(rx_frame_errors); +NETSTAT_ENTRY(rx_fifo_errors); +NETSTAT_ENTRY(rx_missed_errors); +NETSTAT_ENTRY(tx_aborted_errors); +NETSTAT_ENTRY(tx_carrier_errors); +NETSTAT_ENTRY(tx_fifo_errors); +NETSTAT_ENTRY(tx_heartbeat_errors); +NETSTAT_ENTRY(tx_window_errors); +NETSTAT_ENTRY(rx_compressed); +NETSTAT_ENTRY(tx_compressed); +NETSTAT_ENTRY(rx_nohandler); + +static struct attribute *netstat_attrs[] __ro_after_init = { + &dev_attr_rx_packets.attr, + &dev_attr_tx_packets.attr, + &dev_attr_rx_bytes.attr, + &dev_attr_tx_bytes.attr, + &dev_attr_rx_errors.attr, + &dev_attr_tx_errors.attr, + &dev_attr_rx_dropped.attr, + &dev_attr_tx_dropped.attr, + &dev_attr_multicast.attr, + &dev_attr_collisions.attr, + &dev_attr_rx_length_errors.attr, + &dev_attr_rx_over_errors.attr, + &dev_attr_rx_crc_errors.attr, + &dev_attr_rx_frame_errors.attr, + &dev_attr_rx_fifo_errors.attr, + &dev_attr_rx_missed_errors.attr, + &dev_attr_tx_aborted_errors.attr, + &dev_attr_tx_carrier_errors.attr, + &dev_attr_tx_fifo_errors.attr, + &dev_attr_tx_heartbeat_errors.attr, + &dev_attr_tx_window_errors.attr, + &dev_attr_rx_compressed.attr, + &dev_attr_tx_compressed.attr, + &dev_attr_rx_nohandler.attr, + NULL +}; + +static const struct attribute_group netstat_group = { + .name = "statistics", + .attrs = netstat_attrs, +}; + +static struct attribute *wireless_attrs[] = { + NULL +}; + +static const struct attribute_group wireless_group = { + .name = "wireless", + .attrs = wireless_attrs, +}; + +static bool wireless_group_needed(struct net_device *ndev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + if (ndev->ieee80211_ptr) + return true; +#endif +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + if (ndev->wireless_handlers) + return true; +#endif + return false; +} + +#else /* CONFIG_SYSFS */ +#define net_class_groups NULL +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_SYSFS +#define to_rx_queue_attr(_attr) \ + container_of(_attr, struct rx_queue_attribute, attr) + +#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) + +static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, buf); +} + +static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, buf, count); +} + +static const struct sysfs_ops rx_queue_sysfs_ops = { + .show = rx_queue_attr_show, + .store = rx_queue_attr_store, +}; + +#ifdef CONFIG_RPS +static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) +{ + struct rps_map *map; + cpumask_var_t mask; + int i, len; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map) + for (i = 0; i < map->len; i++) + cpumask_set_cpu(map->cpus[i], mask); + + len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); + rcu_read_unlock(); + free_cpumask_var(mask); + + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + cpumask_var_t mask; + int err, cpu, i; + static DEFINE_MUTEX(rps_map_mutex); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + if (!cpumask_empty(mask)) { + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + if (cpumask_empty(mask)) { + free_cpumask_var(mask); + return -EINVAL; + } + } + + map = kzalloc(max_t(unsigned int, + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) { + free_cpumask_var(mask); + return -ENOMEM; + } + + i = 0; + for_each_cpu_and(cpu, mask, cpu_online_mask) + map->cpus[i++] = cpu; + + if (i) { + map->len = i; + } else { + kfree(map); + map = NULL; + } + + mutex_lock(&rps_map_mutex); + old_map = rcu_dereference_protected(queue->rps_map, + mutex_is_locked(&rps_map_mutex)); + rcu_assign_pointer(queue->rps_map, map); + + if (map) + static_branch_inc(&rps_needed); + if (old_map) + static_branch_dec(&rps_needed); + + mutex_unlock(&rps_map_mutex); + + if (old_map) + kfree_rcu(old_map, rcu); + + free_cpumask_var(mask); + return len; +} + +static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + char *buf) +{ + struct rps_dev_flow_table *flow_table; + unsigned long val = 0; + + rcu_read_lock(); + flow_table = rcu_dereference(queue->rps_flow_table); + if (flow_table) + val = (unsigned long)flow_table->mask + 1; + rcu_read_unlock(); + + return sysfs_emit(buf, "%lu\n", val); +} + +static void rps_dev_flow_table_release(struct rcu_head *rcu) +{ + struct rps_dev_flow_table *table = container_of(rcu, + struct rps_dev_flow_table, rcu); + vfree(table); +} + +static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + const char *buf, size_t len) +{ + unsigned long mask, count; + struct rps_dev_flow_table *table, *old_table; + static DEFINE_SPINLOCK(rps_dev_flow_lock); + int rc; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + rc = kstrtoul(buf, 0, &count); + if (rc < 0) + return rc; + + if (count) { + mask = count - 1; + /* mask = roundup_pow_of_two(count) - 1; + * without overflows... + */ + while ((mask | (mask >> 1)) != mask) + mask |= (mask >> 1); + /* On 64 bit arches, must check mask fits in table->mask (u32), + * and on 32bit arches, must check + * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. + */ +#if BITS_PER_LONG > 32 + if (mask > (unsigned long)(u32)mask) + return -EINVAL; +#else + if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) + / sizeof(struct rps_dev_flow)) { + /* Enforce a limit to prevent overflow */ + return -EINVAL; + } +#endif + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); + if (!table) + return -ENOMEM; + + table->mask = mask; + for (count = 0; count <= mask; count++) + table->flows[count].cpu = RPS_NO_CPU; + } else { + table = NULL; + } + + spin_lock(&rps_dev_flow_lock); + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); + rcu_assign_pointer(queue->rps_flow_table, table); + spin_unlock(&rps_dev_flow_lock); + + if (old_table) + call_rcu(&old_table->rcu, rps_dev_flow_table_release); + + return len; +} + +static struct rx_queue_attribute rps_cpus_attribute __ro_after_init + = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); + +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init + = __ATTR(rps_flow_cnt, 0644, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */ + +static struct attribute *rx_queue_default_attrs[] __ro_after_init = { +#ifdef CONFIG_RPS + &rps_cpus_attribute.attr, + &rps_dev_flow_table_cnt_attribute.attr, +#endif + NULL +}; +ATTRIBUTE_GROUPS(rx_queue_default); + +static void rx_queue_release(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + + map = rcu_dereference_protected(queue->rps_map, 1); + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); + kfree_rcu(map, rcu); + } + + flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } +#endif + + memset(kobj, 0, sizeof(*kobj)); + netdev_put(queue->dev, &queue->dev_tracker); +} + +static const void *rx_queue_namespace(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->ns_type) + ns = dev->class->namespace(dev); + + return ns; +} + +static void rx_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = rx_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + +static struct kobj_type rx_queue_ktype __ro_after_init = { + .sysfs_ops = &rx_queue_sysfs_ops, + .release = rx_queue_release, + .default_groups = rx_queue_default_groups, + .namespace = rx_queue_namespace, + .get_ownership = rx_queue_get_ownership, +}; + +static int rx_queue_add_kobject(struct net_device *dev, int index) +{ + struct netdev_rx_queue *queue = dev->_rx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + /* Kobject_put later will trigger rx_queue_release call which + * decreases dev refcount: Take that reference here + */ + netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); + + kobj->kset = dev->queues_kset; + error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, + "rx-%u", index); + if (error) + goto err; + + if (dev->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); + if (error) + goto err; + } + + kobject_uevent(kobj, KOBJ_ADD); + + return error; + +err: + kobject_put(kobj); + return error; +} + +static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid, + kgid_t kgid) +{ + struct netdev_rx_queue *queue = dev->_rx + index; + struct kobject *kobj = &queue->kobj; + int error; + + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + return error; + + if (dev->sysfs_rx_queue_group) + error = sysfs_group_change_owner( + kobj, dev->sysfs_rx_queue_group, kuid, kgid); + + return error; +} +#endif /* CONFIG_SYSFS */ + +int +net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) +{ +#ifdef CONFIG_SYSFS + int i; + int error = 0; + +#ifndef CONFIG_RPS + if (!dev->sysfs_rx_queue_group) + return 0; +#endif + for (i = old_num; i < new_num; i++) { + error = rx_queue_add_kobject(dev, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) { + struct kobject *kobj = &dev->_rx[i].kobj; + + if (!refcount_read(&dev_net(dev)->ns.count)) + kobj->uevent_suppress = 1; + if (dev->sysfs_rx_queue_group) + sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + kobject_put(kobj); + } + + return error; +#else + return 0; +#endif +} + +static int net_rx_queue_change_owner(struct net_device *dev, int num, + kuid_t kuid, kgid_t kgid) +{ +#ifdef CONFIG_SYSFS + int error = 0; + int i; + +#ifndef CONFIG_RPS + if (!dev->sysfs_rx_queue_group) + return 0; +#endif + for (i = 0; i < num; i++) { + error = rx_queue_change_owner(dev, i, kuid, kgid); + if (error) + break; + } + + return error; +#else + return 0; +#endif +} + +#ifdef CONFIG_SYSFS +/* + * netdev_queue sysfs structures and functions. + */ +struct netdev_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct netdev_queue *queue, + const char *buf, size_t len); +}; +#define to_netdev_queue_attr(_attr) \ + container_of(_attr, struct netdev_queue_attribute, attr) + +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) + +static ssize_t netdev_queue_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + const struct netdev_queue_attribute *attribute + = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, buf); +} + +static ssize_t netdev_queue_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + const struct netdev_queue_attribute *attribute + = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, buf, count); +} + +static const struct sysfs_ops netdev_queue_sysfs_ops = { + .show = netdev_queue_attr_show, + .store = netdev_queue_attr_store, +}; + +static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +{ + unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); + + return sysfs_emit(buf, fmt_ulong, trans_timeout); +} + +static unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + unsigned int i; + + i = queue - dev->_tx; + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + +static ssize_t traffic_class_show(struct netdev_queue *queue, + char *buf) +{ + struct net_device *dev = queue->dev; + int num_tc, tc; + int index; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + if (!rtnl_trylock()) + return restart_syscall(); + + index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + + rtnl_unlock(); + + if (tc < 0) + return -EINVAL; + + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : + sysfs_emit(buf, "%d\n", tc); +} + +#ifdef CONFIG_XPS +static ssize_t tx_maxrate_show(struct netdev_queue *queue, + char *buf) +{ + return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); +} + +static ssize_t tx_maxrate_store(struct netdev_queue *queue, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + int err, index = get_netdev_queue_index(queue); + u32 rate = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* The check is also done later; this helps returning early without + * hitting the trylock/restart below. + */ + if (!dev->netdev_ops->ndo_set_tx_maxrate) + return -EOPNOTSUPP; + + err = kstrtou32(buf, 10, &rate); + if (err < 0) + return err; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = -EOPNOTSUPP; + if (dev->netdev_ops->ndo_set_tx_maxrate) + err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + + rtnl_unlock(); + if (!err) { + queue->tx_maxrate = rate; + return len; + } + return err; +} + +static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init + = __ATTR_RW(tx_maxrate); +#endif + +static struct netdev_queue_attribute queue_trans_timeout __ro_after_init + = __ATTR_RO(tx_timeout); + +static struct netdev_queue_attribute queue_traffic_class __ro_after_init + = __ATTR_RO(traffic_class); + +#ifdef CONFIG_BQL +/* + * Byte queue limits sysfs structures and functions. + */ +static ssize_t bql_show(char *buf, unsigned int value) +{ + return sysfs_emit(buf, "%u\n", value); +} + +static ssize_t bql_set(const char *buf, const size_t count, + unsigned int *pvalue) +{ + unsigned int value; + int err; + + if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) { + value = DQL_MAX_LIMIT; + } else { + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + if (value > DQL_MAX_LIMIT) + return -EINVAL; + } + + *pvalue = value; + + return count; +} + +static ssize_t bql_show_hold_time(struct netdev_queue *queue, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); +} + +static ssize_t bql_set_hold_time(struct netdev_queue *queue, + const char *buf, size_t len) +{ + struct dql *dql = &queue->dql; + unsigned int value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + dql->slack_hold_time = msecs_to_jiffies(value); + + return len; +} + +static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init + = __ATTR(hold_time, 0644, + bql_show_hold_time, bql_set_hold_time); + +static ssize_t bql_show_inflight(struct netdev_queue *queue, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = + __ATTR(inflight, 0444, bql_show_inflight, NULL); + +#define BQL_ATTR(NAME, FIELD) \ +static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ + char *buf) \ +{ \ + return bql_show(buf, queue->dql.FIELD); \ +} \ + \ +static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ + const char *buf, size_t len) \ +{ \ + return bql_set(buf, len, &queue->dql.FIELD); \ +} \ + \ +static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ + = __ATTR(NAME, 0644, \ + bql_show_ ## NAME, bql_set_ ## NAME) + +BQL_ATTR(limit, limit); +BQL_ATTR(limit_max, max_limit); +BQL_ATTR(limit_min, min_limit); + +static struct attribute *dql_attrs[] __ro_after_init = { + &bql_limit_attribute.attr, + &bql_limit_max_attribute.attr, + &bql_limit_min_attribute.attr, + &bql_hold_time_attribute.attr, + &bql_inflight_attribute.attr, + NULL +}; + +static const struct attribute_group dql_group = { + .name = "byte_queue_limits", + .attrs = dql_attrs, +}; +#endif /* CONFIG_BQL */ + +#ifdef CONFIG_XPS +static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, + int tc, char *buf, enum xps_map_type type) +{ + struct xps_dev_maps *dev_maps; + unsigned long *mask; + unsigned int nr_ids; + int j, len; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps[type]); + + /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 + * when dev_maps hasn't been allocated yet, to be backward compatible. + */ + nr_ids = dev_maps ? dev_maps->nr_ids : + (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); + + mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); + if (!mask) { + rcu_read_unlock(); + return -ENOMEM; + } + + if (!dev_maps || tc >= dev_maps->num_tc) + goto out_no_maps; + + for (j = 0; j < nr_ids; j++) { + int i, tci = j * dev_maps->num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + __set_bit(j, mask); + break; + } + } + } +out_no_maps: + rcu_read_unlock(); + + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); + bitmap_free(mask); + + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int len, tc; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + + if (!rtnl_trylock()) + return restart_syscall(); + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + rtnl_unlock(); + return -EINVAL; + } + + /* Make sure the subordinate device can't be freed */ + get_device(&dev->dev); + rtnl_unlock(); + + len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); + + put_device(&dev->dev); + return len; +} + +static ssize_t xps_cpus_store(struct netdev_queue *queue, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + unsigned int index; + cpumask_var_t mask; + int err; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); + + free_cpumask_var(mask); + + return err ? : len; +} + +static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init + = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int tc; + + index = get_netdev_queue_index(queue); + + if (!rtnl_trylock()) + return restart_syscall(); + + tc = netdev_txq_to_tc(dev, index); + rtnl_unlock(); + if (tc < 0) + return -EINVAL; + + return xps_queue_show(dev, index, tc, buf, XPS_RXQS); +} + +static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask; + unsigned int index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + bitmap_free(mask); + return err; + } + + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + + cpus_read_lock(); + err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); + cpus_read_unlock(); + + rtnl_unlock(); + + bitmap_free(mask); + return err ? : len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); +#endif /* CONFIG_XPS */ + +static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { + &queue_trans_timeout.attr, + &queue_traffic_class.attr, +#ifdef CONFIG_XPS + &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, + &queue_tx_maxrate.attr, +#endif + NULL +}; +ATTRIBUTE_GROUPS(netdev_queue_default); + +static void netdev_queue_release(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + + memset(kobj, 0, sizeof(*kobj)); + netdev_put(queue->dev, &queue->dev_tracker); +} + +static const void *netdev_queue_namespace(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + struct device *dev = &queue->dev->dev; + const void *ns = NULL; + + if (dev->class && dev->class->ns_type) + ns = dev->class->namespace(dev); + + return ns; +} + +static void netdev_queue_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid) +{ + const struct net *net = netdev_queue_namespace(kobj); + + net_ns_get_ownership(net, uid, gid); +} + +static struct kobj_type netdev_queue_ktype __ro_after_init = { + .sysfs_ops = &netdev_queue_sysfs_ops, + .release = netdev_queue_release, + .default_groups = netdev_queue_default_groups, + .namespace = netdev_queue_namespace, + .get_ownership = netdev_queue_get_ownership, +}; + +static int netdev_queue_add_kobject(struct net_device *dev, int index) +{ + struct netdev_queue *queue = dev->_tx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + /* Kobject_put later will trigger netdev_queue_release call + * which decreases dev refcount: Take that reference here + */ + netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); + + kobj->kset = dev->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) + goto err; + +#ifdef CONFIG_BQL + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto err; +#endif + + kobject_uevent(kobj, KOBJ_ADD); + return 0; + +err: + kobject_put(kobj); + return error; +} + +static int tx_queue_change_owner(struct net_device *ndev, int index, + kuid_t kuid, kgid_t kgid) +{ + struct netdev_queue *queue = ndev->_tx + index; + struct kobject *kobj = &queue->kobj; + int error; + + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + return error; + +#ifdef CONFIG_BQL + error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); +#endif + return error; +} +#endif /* CONFIG_SYSFS */ + +int +netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) +{ +#ifdef CONFIG_SYSFS + int i; + int error = 0; + + /* Tx queue kobjects are allowed to be updated when a device is being + * unregistered, but solely to remove queues from qdiscs. Any path + * adding queues should be fixed. + */ + WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num, + "New queues can't be registered after device unregistration."); + + for (i = old_num; i < new_num; i++) { + error = netdev_queue_add_kobject(dev, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) { + struct netdev_queue *queue = dev->_tx + i; + + if (!refcount_read(&dev_net(dev)->ns.count)) + queue->kobj.uevent_suppress = 1; +#ifdef CONFIG_BQL + sysfs_remove_group(&queue->kobj, &dql_group); +#endif + kobject_put(&queue->kobj); + } + + return error; +#else + return 0; +#endif /* CONFIG_SYSFS */ +} + +static int net_tx_queue_change_owner(struct net_device *dev, int num, + kuid_t kuid, kgid_t kgid) +{ +#ifdef CONFIG_SYSFS + int error = 0; + int i; + + for (i = 0; i < num; i++) { + error = tx_queue_change_owner(dev, i, kuid, kgid); + if (error) + break; + } + + return error; +#else + return 0; +#endif /* CONFIG_SYSFS */ +} + +static int register_queue_kobjects(struct net_device *dev) +{ + int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; + +#ifdef CONFIG_SYSFS + dev->queues_kset = kset_create_and_add("queues", + NULL, &dev->dev.kobj); + if (!dev->queues_kset) + return -ENOMEM; + real_rx = dev->real_num_rx_queues; +#endif + real_tx = dev->real_num_tx_queues; + + error = net_rx_queue_update_kobjects(dev, 0, real_rx); + if (error) + goto error; + rxq = real_rx; + + error = netdev_queue_update_kobjects(dev, 0, real_tx); + if (error) + goto error; + txq = real_tx; + + return 0; + +error: + netdev_queue_update_kobjects(dev, txq, 0); + net_rx_queue_update_kobjects(dev, rxq, 0); +#ifdef CONFIG_SYSFS + kset_unregister(dev->queues_kset); +#endif + return error; +} + +static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid) +{ + int error = 0, real_rx = 0, real_tx = 0; + +#ifdef CONFIG_SYSFS + if (ndev->queues_kset) { + error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid); + if (error) + return error; + } + real_rx = ndev->real_num_rx_queues; +#endif + real_tx = ndev->real_num_tx_queues; + + error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid); + if (error) + return error; + + error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid); + if (error) + return error; + + return 0; +} + +static void remove_queue_kobjects(struct net_device *dev) +{ + int real_rx = 0, real_tx = 0; + +#ifdef CONFIG_SYSFS + real_rx = dev->real_num_rx_queues; +#endif + real_tx = dev->real_num_tx_queues; + + net_rx_queue_update_kobjects(dev, real_rx, 0); + netdev_queue_update_kobjects(dev, real_tx, 0); + + dev->real_num_rx_queues = 0; + dev->real_num_tx_queues = 0; +#ifdef CONFIG_SYSFS + kset_unregister(dev->queues_kset); +#endif +} + +static bool net_current_may_mount(void) +{ + struct net *net = current->nsproxy->net_ns; + + return ns_capable(net->user_ns, CAP_SYS_ADMIN); +} + +static void *net_grab_current_ns(void) +{ + struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS + if (ns) + refcount_inc(&ns->passive); +#endif + return ns; +} + +static const void *net_initial_ns(void) +{ + return &init_net; +} + +static const void *net_netlink_ns(struct sock *sk) +{ + return sock_net(sk); +} + +const struct kobj_ns_type_operations net_ns_type_operations = { + .type = KOBJ_NS_TYPE_NET, + .current_may_mount = net_current_may_mount, + .grab_current_ns = net_grab_current_ns, + .netlink_ns = net_netlink_ns, + .initial_ns = net_initial_ns, + .drop_ns = net_drop_ns, +}; +EXPORT_SYMBOL_GPL(net_ns_type_operations); + +static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) +{ + struct net_device *dev = to_net_dev(d); + int retval; + + /* pass interface to uevent. */ + retval = add_uevent_var(env, "INTERFACE=%s", dev->name); + if (retval) + goto exit; + + /* pass ifindex to uevent. + * ifindex is useful as it won't change (interface name may change) + * and is what RtNetlink uses natively. + */ + retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); + +exit: + return retval; +} + +/* + * netdev_release -- destroy and free a dead device. + * Called when last reference to device kobject is gone. + */ +static void netdev_release(struct device *d) +{ + struct net_device *dev = to_net_dev(d); + + BUG_ON(dev->reg_state != NETREG_RELEASED); + + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); + netdev_freemem(dev); +} + +static const void *net_namespace(struct device *d) +{ + struct net_device *dev = to_net_dev(d); + + return dev_net(dev); +} + +static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid) +{ + struct net_device *dev = to_net_dev(d); + const struct net *net = dev_net(dev); + + net_ns_get_ownership(net, uid, gid); +} + +static struct class net_class __ro_after_init = { + .name = "net", + .dev_release = netdev_release, + .dev_groups = net_class_groups, + .dev_uevent = netdev_uevent, + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, + .get_ownership = net_get_ownership, +}; + +#ifdef CONFIG_OF +static int of_dev_node_match(struct device *dev, const void *data) +{ + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } + + return 0; +} + +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ +struct net_device *of_find_net_device_by_node(struct device_node *np) +{ + struct device *dev; + + dev = class_find_device(&net_class, NULL, np, of_dev_node_match); + if (!dev) + return NULL; + + return to_net_dev(dev); +} +EXPORT_SYMBOL(of_find_net_device_by_node); +#endif + +/* Delete sysfs entries but hold kobject reference until after all + * netdev references are gone. + */ +void netdev_unregister_kobject(struct net_device *ndev) +{ + struct device *dev = &ndev->dev; + + if (!refcount_read(&dev_net(ndev)->ns.count)) + dev_set_uevent_suppress(dev, 1); + + kobject_get(&dev->kobj); + + remove_queue_kobjects(ndev); + + pm_runtime_set_memalloc_noio(dev, false); + + device_del(dev); +} + +/* Create sysfs entries for network device. */ +int netdev_register_kobject(struct net_device *ndev) +{ + struct device *dev = &ndev->dev; + const struct attribute_group **groups = ndev->sysfs_groups; + int error = 0; + + device_initialize(dev); + dev->class = &net_class; + dev->platform_data = ndev; + dev->groups = groups; + + dev_set_name(dev, "%s", ndev->name); + +#ifdef CONFIG_SYSFS + /* Allow for a device specific group */ + if (*groups) + groups++; + + *groups++ = &netstat_group; + + if (wireless_group_needed(ndev)) + *groups++ = &wireless_group; +#endif /* CONFIG_SYSFS */ + + error = device_add(dev); + if (error) + return error; + + error = register_queue_kobjects(ndev); + if (error) { + device_del(dev); + return error; + } + + pm_runtime_set_memalloc_noio(dev, true); + + return error; +} + +/* Change owner for sysfs entries when moving network devices across network + * namespaces owned by different user namespaces. + */ +int netdev_change_owner(struct net_device *ndev, const struct net *net_old, + const struct net *net_new) +{ + kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; + kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; + struct device *dev = &ndev->dev; + int error; + + net_ns_get_ownership(net_old, &old_uid, &old_gid); + net_ns_get_ownership(net_new, &new_uid, &new_gid); + + /* The network namespace was changed but the owning user namespace is + * identical so there's no need to change the owner of sysfs entries. + */ + if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) + return 0; + + error = device_change_owner(dev, new_uid, new_gid); + if (error) + return error; + + error = queue_change_owner(ndev, new_uid, new_gid); + if (error) + return error; + + return 0; +} + +int netdev_class_create_file_ns(const struct class_attribute *class_attr, + const void *ns) +{ + return class_create_file_ns(&net_class, class_attr, ns); +} +EXPORT_SYMBOL(netdev_class_create_file_ns); + +void netdev_class_remove_file_ns(const struct class_attribute *class_attr, + const void *ns) +{ + class_remove_file_ns(&net_class, class_attr, ns); +} +EXPORT_SYMBOL(netdev_class_remove_file_ns); + +int __init netdev_kobject_init(void) +{ + kobj_ns_type_register(&net_ns_type_operations); + return class_register(&net_class); +} diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h new file mode 100644 index 000000000..8a5b04c26 --- /dev/null +++ b/net/core/net-sysfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_SYSFS_H__ +#define __NET_SYSFS_H__ + +int __init netdev_kobject_init(void); +int netdev_register_kobject(struct net_device *); +void netdev_unregister_kobject(struct net_device *); +int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +int netdev_queue_update_kobjects(struct net_device *net, + int old_num, int new_num); +int netdev_change_owner(struct net_device *, const struct net *net_old, + const struct net *net_new); + +#endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c new file mode 100644 index 000000000..c40cd8dd7 --- /dev/null +++ b/net/core/net-traces.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * consolidates trace point definitions + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/export.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <linux/slab.h> + +#include <asm/unaligned.h> +#include <asm/bitops.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/skb.h> +#include <trace/events/net.h> +#include <trace/events/napi.h> +#include <trace/events/sock.h> +#include <trace/events/udp.h> +#include <trace/events/tcp.h> +#include <trace/events/fib.h> +#include <trace/events/qdisc.h> +#if IS_ENABLED(CONFIG_BRIDGE) +#include <trace/events/bridge.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); +#endif + +#if IS_ENABLED(CONFIG_PAGE_POOL) +#include <trace/events/page_pool.h> +#endif + +#include <trace/events/neigh.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); + +EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); + +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); +EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c new file mode 100644 index 000000000..4c1707d0e --- /dev/null +++ b/net/core/net_namespace.c @@ -0,0 +1,1388 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/workqueue.h> +#include <linux/rtnetlink.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/idr.h> +#include <linux/rculist.h> +#include <linux/nsproxy.h> +#include <linux/fs.h> +#include <linux/proc_ns.h> +#include <linux/file.h> +#include <linux/export.h> +#include <linux/user_namespace.h> +#include <linux/net_namespace.h> +#include <linux/sched/task.h> +#include <linux/uidgid.h> +#include <linux/cookie.h> + +#include <net/sock.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +/* + * Our network namespace constructor/destructor lists + */ + +static LIST_HEAD(pernet_list); +static struct list_head *first_device = &pernet_list; + +LIST_HEAD(net_namespace_list); +EXPORT_SYMBOL_GPL(net_namespace_list); + +/* Protects net_namespace_list. Nests iside rtnl_lock() */ +DECLARE_RWSEM(net_rwsem); +EXPORT_SYMBOL_GPL(net_rwsem); + +#ifdef CONFIG_KEYS +static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; +#endif + +struct net init_net; +EXPORT_SYMBOL(init_net); + +static bool init_net_initialized; +/* + * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, + * init_net_initialized and first_device pointer. + * This is internal net namespace object. Please, don't use it + * outside. + */ +DECLARE_RWSEM(pernet_ops_rwsem); +EXPORT_SYMBOL_GPL(pernet_ops_rwsem); + +#define MIN_PERNET_OPS_ID \ + ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) + +#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ + +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +DEFINE_COOKIE(net_cookie); + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->s.len = max_gen_ptrs; + + return ng; +} + +static int net_assign_generic(struct net *net, unsigned int id, void *data) +{ + struct net_generic *ng, *old_ng; + + BUG_ON(id < MIN_PERNET_OPS_ID); + + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + if (old_ng->s.len > id) { + old_ng->ptr[id] = data; + return 0; + } + + ng = net_alloc_generic(); + if (!ng) + return -ENOMEM; + + /* + * Some synchronisation notes: + * + * The net_generic explores the net->gen array inside rcu + * read section. Besides once set the net->gen->ptr[x] + * pointer never changes (see rules in netns/generic.h). + * + * That said, we simply duplicate this array and schedule + * the old copy for kfree after a grace period. + */ + + memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], + (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); + ng->ptr[id] = data; + + rcu_assign_pointer(net->gen, ng); + kfree_rcu(old_ng, s.rcu); + return 0; +} + +static int ops_init(const struct pernet_operations *ops, struct net *net) +{ + struct net_generic *ng; + int err = -ENOMEM; + void *data = NULL; + + if (ops->id && ops->size) { + data = kzalloc(ops->size, GFP_KERNEL); + if (!data) + goto out; + + err = net_assign_generic(net, *ops->id, data); + if (err) + goto cleanup; + } + err = 0; + if (ops->init) + err = ops->init(net); + if (!err) + return 0; + + if (ops->id && ops->size) { + ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + ng->ptr[*ops->id] = NULL; + } + +cleanup: + kfree(data); + +out: + return err; +} + +static void ops_pre_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + + if (ops->pre_exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->pre_exit(net); + } +} + +static void ops_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->exit) { + list_for_each_entry(net, net_exit_list, exit_list) { + ops->exit(net); + cond_resched(); + } + } + if (ops->exit_batch) + ops->exit_batch(net_exit_list); +} + +static void ops_free_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->size && ops->id) { + list_for_each_entry(net, net_exit_list, exit_list) + kfree(net_generic(net, *ops->id)); + } +} + +/* should be called with nsid_lock held */ +static int alloc_netid(struct net *net, struct net *peer, int reqid) +{ + int min = 0, max = 0; + + if (reqid >= 0) { + min = reqid; + max = reqid + 1; + } + + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); +} + +/* This function is used by idr_for_each(). If net is equal to peer, the + * function returns the id so that idr_for_each() stops. Because we cannot + * returns the id 0 (idr_for_each() will not stop), we return the magic value + * NET_ID_ZERO (-1) for it. + */ +#define NET_ID_ZERO -1 +static int net_eq_idr(int id, void *net, void *peer) +{ + if (net_eq(net, peer)) + return id ? : NET_ID_ZERO; + return 0; +} + +/* Must be called from RCU-critical section or with nsid_lock held */ +static int __peernet2id(const struct net *net, struct net *peer) +{ + int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + + /* Magic value for id 0. */ + if (id == NET_ID_ZERO) + return 0; + if (id > 0) + return id; + + return NETNSA_NSID_NOT_ASSIGNED; +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp); +/* This function returns the id of a peer netns. If no id is assigned, one will + * be allocated and returned. + */ +int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) +{ + int id; + + if (refcount_read(&net->ns.count) == 0) + return NETNSA_NSID_NOT_ASSIGNED; + + spin_lock_bh(&net->nsid_lock); + id = __peernet2id(net, peer); + if (id >= 0) { + spin_unlock_bh(&net->nsid_lock); + return id; + } + + /* When peer is obtained from RCU lists, we may race with + * its cleanup. Check whether it's alive, and this guarantees + * we never hash a peer back to net->netns_ids, after it has + * just been idr_remove()'d from there in cleanup_net(). + */ + if (!maybe_get_net(peer)) { + spin_unlock_bh(&net->nsid_lock); + return NETNSA_NSID_NOT_ASSIGNED; + } + + id = alloc_netid(net, peer, -1); + spin_unlock_bh(&net->nsid_lock); + + put_net(peer); + if (id < 0) + return NETNSA_NSID_NOT_ASSIGNED; + + rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); + + return id; +} +EXPORT_SYMBOL_GPL(peernet2id_alloc); + +/* This function returns, if assigned, the id of a peer netns. */ +int peernet2id(const struct net *net, struct net *peer) +{ + int id; + + rcu_read_lock(); + id = __peernet2id(net, peer); + rcu_read_unlock(); + + return id; +} +EXPORT_SYMBOL(peernet2id); + +/* This function returns true is the peer netns has an id assigned into the + * current netns. + */ +bool peernet_has_id(const struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; +} + +struct net *get_net_ns_by_id(const struct net *net, int id) +{ + struct net *peer; + + if (id < 0) + return NULL; + + rcu_read_lock(); + peer = idr_find(&net->netns_ids, id); + if (peer) + peer = maybe_get_net(peer); + rcu_read_unlock(); + + return peer; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_id); + +/* + * setup_net runs the initializers for the network namespace object. + */ +static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) +{ + /* Must be called with pernet_ops_rwsem held */ + const struct pernet_operations *ops, *saved_ops; + int error = 0; + LIST_HEAD(net_exit_list); + + refcount_set(&net->ns.count, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128); + + refcount_set(&net->passive, 1); + get_random_bytes(&net->hash_mix, sizeof(u32)); + preempt_disable(); + net->net_cookie = gen_cookie_next(&net_cookie); + preempt_enable(); + net->dev_base_seq = 1; + net->user_ns = user_ns; + idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); + + list_for_each_entry(ops, &pernet_list, list) { + error = ops_init(ops, net); + if (error < 0) + goto out_undo; + } + down_write(&net_rwsem); + list_add_tail_rcu(&net->list, &net_namespace_list); + up_write(&net_rwsem); +out: + return error; + +out_undo: + /* Walk through the list backwards calling the exit functions + * for the pernet modules whose init functions did not fail. + */ + list_add(&net->exit_list, &net_exit_list); + saved_ops = ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_pre_exit_list(ops, &net_exit_list); + + synchronize_rcu(); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); + + rcu_barrier(); + goto out; +} + +static int __net_init net_defaults_init_net(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + + return 0; +} + +static struct pernet_operations net_defaults_ops = { + .init = net_defaults_init_net, +}; + +static __init int net_defaults_init(void) +{ + if (register_pernet_subsys(&net_defaults_ops)) + panic("Cannot initialize net default settings"); + + return 0; +} + +core_initcall(net_defaults_init); + +#ifdef CONFIG_NET_NS +static struct ucounts *inc_net_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); +} + +static void dec_net_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); +} + +static struct kmem_cache *net_cachep __ro_after_init; +static struct workqueue_struct *netns_wq; + +static struct net *net_alloc(void) +{ + struct net *net = NULL; + struct net_generic *ng; + + ng = net_alloc_generic(); + if (!ng) + goto out; + + net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); + if (!net) + goto out_free; + +#ifdef CONFIG_KEYS + net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); + if (!net->key_domain) + goto out_free_2; + refcount_set(&net->key_domain->usage, 1); +#endif + + rcu_assign_pointer(net->gen, ng); +out: + return net; + +#ifdef CONFIG_KEYS +out_free_2: + kmem_cache_free(net_cachep, net); + net = NULL; +#endif +out_free: + kfree(ng); + goto out; +} + +static void net_free(struct net *net) +{ + if (refcount_dec_and_test(&net->passive)) { + kfree(rcu_access_pointer(net->gen)); + kmem_cache_free(net_cachep, net); + } +} + +void net_drop_ns(void *p) +{ + struct net *net = (struct net *)p; + + if (net) + net_free(net); +} + +struct net *copy_net_ns(unsigned long flags, + struct user_namespace *user_ns, struct net *old_net) +{ + struct ucounts *ucounts; + struct net *net; + int rv; + + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + + ucounts = inc_net_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + net = net_alloc(); + if (!net) { + rv = -ENOMEM; + goto dec_ucounts; + } + refcount_set(&net->passive, 1); + net->ucounts = ucounts; + get_user_ns(user_ns); + + rv = down_read_killable(&pernet_ops_rwsem); + if (rv < 0) + goto put_userns; + + rv = setup_net(net, user_ns); + + up_read(&pernet_ops_rwsem); + + if (rv < 0) { +put_userns: +#ifdef CONFIG_KEYS + key_remove_domain(net->key_domain); +#endif + put_user_ns(user_ns); + net_free(net); +dec_ucounts: + dec_net_namespaces(ucounts); + return ERR_PTR(rv); + } + return net; +} + +/** + * net_ns_get_ownership - get sysfs ownership data for @net + * @net: network namespace in question (can be NULL) + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns the uid/gid pair of root in the user namespace associated with the + * given network namespace. + */ +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} +EXPORT_SYMBOL_GPL(net_ns_get_ownership); + +static void unhash_nsid(struct net *net, struct net *last) +{ + struct net *tmp; + /* This function is only called from cleanup_net() work, + * and this work is the only process, that may delete + * a net from net_namespace_list. So, when the below + * is executing, the list may only grow. Thus, we do not + * use for_each_net_rcu() or net_rwsem. + */ + for_each_net(tmp) { + int id; + + spin_lock_bh(&tmp->nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + spin_unlock_bh(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + GFP_KERNEL); + if (tmp == last) + break; + } + spin_lock_bh(&net->nsid_lock); + idr_destroy(&net->netns_ids); + spin_unlock_bh(&net->nsid_lock); +} + +static LLIST_HEAD(cleanup_list); + +static void cleanup_net(struct work_struct *work) +{ + const struct pernet_operations *ops; + struct net *net, *tmp, *last; + struct llist_node *net_kill_list; + LIST_HEAD(net_exit_list); + + /* Atomically snapshot the list of namespaces to cleanup */ + net_kill_list = llist_del_all(&cleanup_list); + + down_read(&pernet_ops_rwsem); + + /* Don't let anyone else find us. */ + down_write(&net_rwsem); + llist_for_each_entry(net, net_kill_list, cleanup_list) + list_del_rcu(&net->list); + /* Cache last net. After we unlock rtnl, no one new net + * added to net_namespace_list can assign nsid pointer + * to a net from net_kill_list (see peernet2id_alloc()). + * So, we skip them in unhash_nsid(). + * + * Note, that unhash_nsid() does not delete nsid links + * between net_kill_list's nets, as they've already + * deleted from net_namespace_list. But, this would be + * useless anyway, as netns_ids are destroyed there. + */ + last = list_last_entry(&net_namespace_list, struct net, list); + up_write(&net_rwsem); + + llist_for_each_entry(net, net_kill_list, cleanup_list) { + unhash_nsid(net, last); + list_add_tail(&net->exit_list, &net_exit_list); + } + + /* Run all of the network namespace pre_exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_pre_exit_list(ops, &net_exit_list); + + /* + * Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so + * the rcu_barrier() below isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. + */ + synchronize_rcu(); + + /* Run all of the network namespace exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + /* Free the net generic variables */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); + + up_read(&pernet_ops_rwsem); + + /* Ensure there are no outstanding rcu callbacks using this + * network namespace. + */ + rcu_barrier(); + + /* Finally it is safe to free my network namespace structure */ + list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + list_del_init(&net->exit_list); + dec_net_namespaces(net->ucounts); +#ifdef CONFIG_KEYS + key_remove_domain(net->key_domain); +#endif + put_user_ns(net->user_ns); + net_free(net); + } +} + +/** + * net_ns_barrier - wait until concurrent net_cleanup_work is done + * + * cleanup_net runs from work queue and will first remove namespaces + * from the global list, then run net exit functions. + * + * Call this in module exit path to make sure that all netns + * ->exit ops have been invoked before the function is removed. + */ +void net_ns_barrier(void) +{ + down_write(&pernet_ops_rwsem); + up_write(&pernet_ops_rwsem); +} +EXPORT_SYMBOL(net_ns_barrier); + +static DECLARE_WORK(net_cleanup_work, cleanup_net); + +void __put_net(struct net *net) +{ + ref_tracker_dir_exit(&net->refcnt_tracker); + /* Cleanup the network namespace in process context */ + if (llist_add(&net->cleanup_list, &cleanup_list)) + queue_work(netns_wq, &net_cleanup_work); +} +EXPORT_SYMBOL_GPL(__put_net); + +/** + * get_net_ns - increment the refcount of the network namespace + * @ns: common namespace (net) + * + * Returns the net's common namespace. + */ +struct ns_common *get_net_ns(struct ns_common *ns) +{ + return &get_net(container_of(ns, struct net, ns))->ns; +} +EXPORT_SYMBOL_GPL(get_net_ns); + +struct net *get_net_ns_by_fd(int fd) +{ + struct file *file; + struct ns_common *ns; + struct net *net; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return ERR_CAST(file); + + ns = get_proc_ns(file_inode(file)); + if (ns->ops == &netns_operations) + net = get_net(container_of(ns, struct net, ns)); + else + net = ERR_PTR(-EINVAL); + + fput(file); + return net; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_fd); +#endif + +struct net *get_net_ns_by_pid(pid_t pid) +{ + struct task_struct *tsk; + struct net *net; + + /* Lookup the network namespace */ + net = ERR_PTR(-ESRCH); + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) { + struct nsproxy *nsproxy; + task_lock(tsk); + nsproxy = tsk->nsproxy; + if (nsproxy) + net = get_net(nsproxy->net_ns); + task_unlock(tsk); + } + rcu_read_unlock(); + return net; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_pid); + +static __net_init int net_ns_net_init(struct net *net) +{ +#ifdef CONFIG_NET_NS + net->ns.ops = &netns_operations; +#endif + return ns_alloc_inum(&net->ns); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ + ns_free_inum(&net->ns); +} + +static struct pernet_operations __net_initdata net_ns_ops = { + .init = net_ns_net_init, + .exit = net_ns_net_exit, +}; + +static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { + [NETNSA_NONE] = { .type = NLA_UNSPEC }, + [NETNSA_NSID] = { .type = NLA_S32 }, + [NETNSA_PID] = { .type = NLA_U32 }, + [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, +}; + +static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NETNSA_MAX + 1]; + struct nlattr *nla; + struct net *peer; + int nsid, err; + + err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, extack); + if (err < 0) + return err; + if (!tb[NETNSA_NSID]) { + NL_SET_ERR_MSG(extack, "nsid is missing"); + return -EINVAL; + } + nsid = nla_get_s32(tb[NETNSA_NSID]); + + if (tb[NETNSA_PID]) { + peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); + nla = tb[NETNSA_PID]; + } else if (tb[NETNSA_FD]) { + peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); + nla = tb[NETNSA_FD]; + } else { + NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); + return -EINVAL; + } + if (IS_ERR(peer)) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); + return PTR_ERR(peer); + } + + spin_lock_bh(&net->nsid_lock); + if (__peernet2id(net, peer) >= 0) { + spin_unlock_bh(&net->nsid_lock); + err = -EEXIST; + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, + "Peer netns already has a nsid assigned"); + goto out; + } + + err = alloc_netid(net, peer, nsid); + spin_unlock_bh(&net->nsid_lock); + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, + nlh, GFP_KERNEL); + err = 0; + } else if (err == -ENOSPC && nsid >= 0) { + err = -EEXIST; + NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); + NL_SET_ERR_MSG(extack, "The specified nsid is already used"); + } +out: + put_net(peer); + return err; +} + +static int rtnl_net_get_size(void) +{ + return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ + ; +} + +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) +{ + struct nlmsghdr *nlh; + struct rtgenmsg *rth; + + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); + if (!nlh) + return -EMSGSIZE; + + rth = nlmsg_data(nlh); + rth->rtgen_family = AF_UNSPEC; + + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int rtnl_net_valid_getid_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int i, err; + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), + tb, NETNSA_MAX, rtnl_net_policy, + extack); + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, + extack); + if (err) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case NETNSA_PID: + case NETNSA_FD: + case NETNSA_NSID: + case NETNSA_TARGET_NSID: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; + struct nlattr *nla; + struct sk_buff *msg; + int err; + + err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); + if (err < 0) + return err; + if (tb[NETNSA_PID]) { + peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); + nla = tb[NETNSA_PID]; + } else if (tb[NETNSA_FD]) { + peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); + nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; + } else { + NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); + return -EINVAL; + } + + if (IS_ERR(peer)) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); + return PTR_ERR(peer); + } + + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); + if (err < 0) + goto err_out; + + err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid); + goto out; + +err_out: + nlmsg_free(msg); +out: + if (fillargs.add_ref) + put_net(target); + put_net(peer); + return err; +} + +struct rtnl_net_dump_cb { + struct net *tgt_net; + struct net *ref_net; + struct sk_buff *skb; + struct net_fill_args fillargs; + int idx; + int s_idx; +}; + +/* Runs in RCU-critical section. */ +static int rtnl_net_dumpid_one(int id, void *peer, void *data) +{ + struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; + int ret; + + if (net_cb->idx < net_cb->s_idx) + goto cont; + + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); + if (ret < 0) + return ret; + +cont: + net_cb->idx++; + return 0; +} + +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, + extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtnl_net_dump_cb net_cb = { + .tgt_net = sock_net(skb->sk), + .skb = skb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, + .idx = 0, + .s_idx = cb->args[0], + }; + int err = 0; + + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; + } + + rcu_read_lock(); + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + rcu_read_unlock(); + + cb->args[0] = net_cb.idx; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, + struct nlmsghdr *nlh, gfp_t gfp) +{ + struct net_fill_args fillargs = { + .portid = portid, + .seq = nlh ? nlh->nlmsg_seq : 0, + .cmd = cmd, + .nsid = id, + }; + struct sk_buff *msg; + int err = -ENOMEM; + + msg = nlmsg_new(rtnl_net_get_size(), gfp); + if (!msg) + goto out; + + err = rtnl_net_fill(msg, &fillargs); + if (err < 0) + goto err_out; + + rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); + return; + +err_out: + nlmsg_free(msg); +out: + rtnl_set_sk_err(net, RTNLGRP_NSID, err); +} + +void __init net_ns_init(void) +{ + struct net_generic *ng; + +#ifdef CONFIG_NET_NS + net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), + SMP_CACHE_BYTES, + SLAB_PANIC|SLAB_ACCOUNT, NULL); + + /* Create workqueue for cleanup */ + netns_wq = create_singlethread_workqueue("netns"); + if (!netns_wq) + panic("Could not create netns workq"); +#endif + + ng = net_alloc_generic(); + if (!ng) + panic("Could not allocate generic netns"); + + rcu_assign_pointer(init_net.gen, ng); + +#ifdef CONFIG_KEYS + init_net.key_domain = &init_net_key_domain; +#endif + down_write(&pernet_ops_rwsem); + if (setup_net(&init_net, &init_user_ns)) + panic("Could not setup the initial network namespace"); + + init_net_initialized = true; + up_write(&pernet_ops_rwsem); + + if (register_pernet_subsys(&net_ns_ops)) + panic("Could not register network namespace subsystems"); + + rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, + RTNL_FLAG_DOIT_UNLOCKED); + rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, + RTNL_FLAG_DOIT_UNLOCKED); +} + +static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) +{ + ops_pre_exit_list(ops, net_exit_list); + synchronize_rcu(); + ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); +} + +#ifdef CONFIG_NET_NS +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + struct net *net; + int error; + LIST_HEAD(net_exit_list); + + list_add_tail(&ops->list, list); + if (ops->init || (ops->id && ops->size)) { + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ + for_each_net(net) { + error = ops_init(ops, net); + if (error) + goto out_undo; + list_add_tail(&net->exit_list, &net_exit_list); + } + } + return 0; + +out_undo: + /* If I have an error cleanup all namespaces I initialized */ + list_del(&ops->list); + free_exit_list(ops, &net_exit_list); + return error; +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + struct net *net; + LIST_HEAD(net_exit_list); + + list_del(&ops->list); + /* See comment in __register_pernet_operations() */ + for_each_net(net) + list_add_tail(&net->exit_list, &net_exit_list); + + free_exit_list(ops, &net_exit_list); +} + +#else + +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + if (!init_net_initialized) { + list_add_tail(&ops->list, list); + return 0; + } + + return ops_init(ops, &init_net); +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + if (!init_net_initialized) { + list_del(&ops->list); + } else { + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + free_exit_list(ops, &net_exit_list); + } +} + +#endif /* CONFIG_NET_NS */ + +static DEFINE_IDA(net_generic_ids); + +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + int error; + + if (ops->id) { + error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, + GFP_KERNEL); + if (error < 0) + return error; + *ops->id = error; + max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); + } + error = __register_pernet_operations(list, ops); + if (error) { + rcu_barrier(); + if (ops->id) + ida_free(&net_generic_ids, *ops->id); + } + + return error; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + __unregister_pernet_operations(ops); + rcu_barrier(); + if (ops->id) + ida_free(&net_generic_ids, *ops->id); +} + +/** + * register_pernet_subsys - register a network namespace subsystem + * @ops: pernet operations structure for the subsystem + * + * Register a subsystem which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_subsys(struct pernet_operations *ops) +{ + int error; + down_write(&pernet_ops_rwsem); + error = register_pernet_operations(first_device, ops); + up_write(&pernet_ops_rwsem); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_subsys); + +/** + * unregister_pernet_subsys - unregister a network namespace subsystem + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_subsys(struct pernet_operations *ops) +{ + down_write(&pernet_ops_rwsem); + unregister_pernet_operations(ops); + up_write(&pernet_ops_rwsem); +} +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); + +/** + * register_pernet_device - register a network namespace device + * @ops: pernet operations structure for the subsystem + * + * Register a device which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_device(struct pernet_operations *ops) +{ + int error; + down_write(&pernet_ops_rwsem); + error = register_pernet_operations(&pernet_list, ops); + if (!error && (first_device == &pernet_list)) + first_device = &ops->list; + up_write(&pernet_ops_rwsem); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_device); + +/** + * unregister_pernet_device - unregister a network namespace netdevice + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_device(struct pernet_operations *ops) +{ + down_write(&pernet_ops_rwsem); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + up_write(&pernet_ops_rwsem); +} +EXPORT_SYMBOL_GPL(unregister_pernet_device); + +#ifdef CONFIG_NET_NS +static struct ns_common *netns_get(struct task_struct *task) +{ + struct net *net = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) + net = get_net(nsproxy->net_ns); + task_unlock(task); + + return net ? &net->ns : NULL; +} + +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + +static void netns_put(struct ns_common *ns) +{ + put_net(to_net_ns(ns)); +} + +static int netns_install(struct nsset *nsset, struct ns_common *ns) +{ + struct nsproxy *nsproxy = nsset->nsproxy; + struct net *net = to_net_ns(ns); + + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + put_net(nsproxy->net_ns); + nsproxy->net_ns = get_net(net); + return 0; +} + +static struct user_namespace *netns_owner(struct ns_common *ns) +{ + return to_net_ns(ns)->user_ns; +} + +const struct proc_ns_operations netns_operations = { + .name = "net", + .type = CLONE_NEWNET, + .get = netns_get, + .put = netns_put, + .install = netns_install, + .owner = netns_owner, +}; +#endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c new file mode 100644 index 000000000..d6a70aeaa --- /dev/null +++ b/net/core/netclassid_cgroup.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/netclassid_cgroup.c Classid Cgroupfs Handling + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fdtable.h> +#include <linux/sched/task.h> + +#include <net/cls_cgroup.h> +#include <net/sock.h> + +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + +struct cgroup_cls_state *task_cls_state(struct task_struct *p) +{ + return css_cls_state(task_css_check(p, net_cls_cgrp_id, + rcu_read_lock_bh_held())); +} +EXPORT_SYMBOL_GPL(task_cls_state); + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_cls_state *cs; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + return &cs->css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css->parent); + + if (parent) + cs->classid = parent->classid; + + return 0; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_cls_state(css)); +} + +/* + * To avoid freezing of sockets creation for tasks with big number of threads + * and opened sockets lets release file_lock every 1000 iterated descriptors. + * New sockets will already have been created with new classid. + */ + +struct update_classid_context { + u32 classid; + unsigned int batch; +}; + +#define UPDATE_CLASSID_BATCH 1000 + +static int update_classid_sock(const void *v, struct file *file, unsigned int n) +{ + struct update_classid_context *ctx = (void *)v; + struct socket *sock = sock_from_file(file); + + if (sock) + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); + if (--ctx->batch == 0) { + ctx->batch = UPDATE_CLASSID_BATCH; + return n + 1; + } + return 0; +} + +static void update_classid_task(struct task_struct *p, u32 classid) +{ + struct update_classid_context ctx = { + .classid = classid, + .batch = UPDATE_CLASSID_BATCH + }; + unsigned int fd = 0; + + do { + task_lock(p); + fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); + task_unlock(p); + cond_resched(); + } while (fd); +} + +static void cgrp_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + cgroup_taskset_for_each(p, css, tset) { + update_classid_task(p, css_cls_state(css)->classid); + } +} + +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return css_cls_state(css)->classid; +} + +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) +{ + struct cgroup_cls_state *cs = css_cls_state(css); + struct css_task_iter it; + struct task_struct *p; + + cs->classid = (u32)value; + + css_task_iter_start(css, 0, &it); + while ((p = css_task_iter_next(&it))) + update_classid_task(p, cs->classid); + css_task_iter_end(&it); + + return 0; +} + +static struct cftype ss_files[] = { + { + .name = "classid", + .read_u64 = read_classid, + .write_u64 = write_classid, + }, + { } /* terminate */ +}; + +struct cgroup_subsys net_cls_cgrp_subsys = { + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, + .attach = cgrp_attach, + .legacy_cftypes = ss_files, +}; diff --git a/net/core/netevent.c b/net/core/netevent.c new file mode 100644 index 000000000..5bb615e96 --- /dev/null +++ b/net/core/netevent.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Network event notifiers + * + * Authors: + * Tom Tucker <tom@opengridcomputing.com> + * Steve Wise <swise@opengridcomputing.com> + * + * Fixes: + */ + +#include <linux/rtnetlink.h> +#include <linux/notifier.h> +#include <linux/export.h> +#include <net/netevent.h> + +static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); + +/** + * register_netevent_notifier - register a netevent notifier block + * @nb: notifier + * + * Register a notifier to be called when a netevent occurs. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + */ +int register_netevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&netevent_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(register_netevent_notifier); + +/** + * unregister_netevent_notifier - unregister a netevent notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_neigh_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&netevent_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_netevent_notifier); + +/** + * call_netevent_notifiers - call all netevent notifier blocks + * @val: value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + * Call all neighbour notifier blocks. Parameters and return value + * are as for notifier_call_chain(). + */ + +int call_netevent_notifiers(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&netevent_notif_chain, val, v); +} +EXPORT_SYMBOL_GPL(call_netevent_notifiers); diff --git a/net/core/netpoll.c b/net/core/netpoll.c new file mode 100644 index 000000000..4ac8d0ad9 --- /dev/null +++ b/net/core/netpoll.c @@ -0,0 +1,878 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common framework for low-level network console, dump, and debugger code + * + * Sep 8 2003 Matt Mackall <mpm@selenic.com> + * + * based on the netconsole code from: + * + * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Red Hat, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/if_vlan.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/addrconf.h> +#include <net/ndisc.h> +#include <net/ip6_checksum.h> +#include <asm/unaligned.h> +#include <trace/events/napi.h> +#include <linux/kconfig.h> + +/* + * We maintain a small pool of fully-sized skbs, to make sure the + * message gets out even in extreme OOM situations. + */ + +#define MAX_UDP_CHUNK 1460 +#define MAX_SKBS 32 + +static struct sk_buff_head skb_pool; + +DEFINE_STATIC_SRCU(netpoll_srcu); + +#define USEC_PER_POLL 50 + +#define MAX_SKB_SIZE \ + (sizeof(struct ethhdr) + \ + sizeof(struct iphdr) + \ + sizeof(struct udphdr) + \ + MAX_UDP_CHUNK) + +static void zap_completion_queue(void); + +static unsigned int carrier_timeout = 4; +module_param(carrier_timeout, uint, 0644); + +#define np_info(np, fmt, ...) \ + pr_info("%s: " fmt, np->name, ##__VA_ARGS__) +#define np_err(np, fmt, ...) \ + pr_err("%s: " fmt, np->name, ##__VA_ARGS__) +#define np_notice(np, fmt, ...) \ + pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) + +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) +{ + netdev_tx_t status = NETDEV_TX_OK; + netdev_features_t features; + + features = netif_skb_features(skb); + + if (skb_vlan_tag_present(skb) && + !vlan_hw_offload_capable(features, skb->vlan_proto)) { + skb = __vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + /* This is actually a packet drop, but we + * don't want the code that calls this + * function to try and operate on a NULL skb. + */ + goto out; + } + } + + status = netdev_start_xmit(skb, dev, txq, false); + +out: + return status; +} + +static void queue_process(struct work_struct *work) +{ + struct netpoll_info *npinfo = + container_of(work, struct netpoll_info, tx_work.work); + struct sk_buff *skb; + unsigned long flags; + + while ((skb = skb_dequeue(&npinfo->txq))) { + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + unsigned int q_index; + + if (!netif_device_present(dev) || !netif_running(dev)) { + kfree_skb(skb); + continue; + } + + local_irq_save(flags); + /* check if skb->queue_mapping is still valid */ + q_index = skb_get_queue_mapping(skb); + if (unlikely(q_index >= dev->real_num_tx_queues)) { + q_index = q_index % dev->real_num_tx_queues; + skb_set_queue_mapping(skb, q_index); + } + txq = netdev_get_tx_queue(dev, q_index); + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (netif_xmit_frozen_or_stopped(txq) || + !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { + skb_queue_head(&npinfo->txq, skb); + HARD_TX_UNLOCK(dev, txq); + local_irq_restore(flags); + + schedule_delayed_work(&npinfo->tx_work, HZ/10); + return; + } + HARD_TX_UNLOCK(dev, txq); + local_irq_restore(flags); + } +} + +static int netif_local_xmit_active(struct net_device *dev) +{ + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + return 1; + } + + return 0; +} + +static void poll_one_napi(struct napi_struct *napi) +{ + int work; + + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + return; + + /* We explicilty pass the polling call a budget of 0 to + * indicate that we are clearing the Tx path only. + */ + work = napi->poll(napi, 0); + WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); + trace_napi_poll(napi, work, 0); + + clear_bit(NAPI_STATE_NPSVC, &napi->state); +} + +static void poll_napi(struct net_device *dev) +{ + struct napi_struct *napi; + int cpu = smp_processor_id(); + + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { + if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { + poll_one_napi(napi); + smp_store_release(&napi->poll_owner, -1); + } + } +} + +void netpoll_poll_dev(struct net_device *dev) +{ + struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + const struct net_device_ops *ops; + + /* Don't do any rx activity if the dev_lock mutex is held + * the dev_open/close paths use this to block netpoll activity + * while changing device state + */ + if (!ni || down_trylock(&ni->dev_lock)) + return; + + /* Some drivers will take the same locks in poll and xmit, + * we can't poll if local CPU is already in xmit. + */ + if (!netif_running(dev) || netif_local_xmit_active(dev)) { + up(&ni->dev_lock); + return; + } + + ops = dev->netdev_ops; + if (ops->ndo_poll_controller) + ops->ndo_poll_controller(dev); + + poll_napi(dev); + + up(&ni->dev_lock); + + zap_completion_queue(); +} +EXPORT_SYMBOL(netpoll_poll_dev); + +void netpoll_poll_disable(struct net_device *dev) +{ + struct netpoll_info *ni; + int idx; + might_sleep(); + idx = srcu_read_lock(&netpoll_srcu); + ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + if (ni) + down(&ni->dev_lock); + srcu_read_unlock(&netpoll_srcu, idx); +} +EXPORT_SYMBOL(netpoll_poll_disable); + +void netpoll_poll_enable(struct net_device *dev) +{ + struct netpoll_info *ni; + rcu_read_lock(); + ni = rcu_dereference(dev->npinfo); + if (ni) + up(&ni->dev_lock); + rcu_read_unlock(); +} +EXPORT_SYMBOL(netpoll_poll_enable); + +static void refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&skb_pool.lock, flags); + while (skb_pool.qlen < MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + + __skb_queue_tail(&skb_pool, skb); + } + spin_unlock_irqrestore(&skb_pool.lock, flags); +} + +static void zap_completion_queue(void) +{ + unsigned long flags; + struct softnet_data *sd = &get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if (!skb_irq_freeable(skb)) { + refcount_set(&skb->users, 1); + dev_kfree_skb_any(skb); /* put this one back */ + } else { + __kfree_skb(skb); + } + } + } + + put_cpu_var(softnet_data); +} + +static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) +{ + int count = 0; + struct sk_buff *skb; + + zap_completion_queue(); + refill_skbs(); +repeat: + + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) + skb = skb_dequeue(&skb_pool); + + if (!skb) { + if (++count < 10) { + netpoll_poll_dev(np->dev); + goto repeat; + } + return NULL; + } + + refcount_set(&skb->users, 1); + skb_reserve(skb, reserve); + return skb; +} + +static int netpoll_owner_active(struct net_device *dev) +{ + struct napi_struct *napi; + + list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { + if (napi->poll_owner == smp_processor_id()) + return 1; + } + return 0; +} + +/* call with IRQ disabled */ +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; + unsigned long tries; + /* It is up to the caller to keep npinfo alive. */ + struct netpoll_info *npinfo; + + lockdep_assert_irqs_disabled(); + + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { + dev_kfree_skb_irq(skb); + return NET_XMIT_DROP; + } + + /* don't get messages out of order, and no recursion */ + if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { + struct netdev_queue *txq; + + txq = netdev_core_pick_tx(dev, skb, NULL); + + /* try until next clock tick */ + for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; + tries > 0; --tries) { + if (HARD_TX_TRYLOCK(dev, txq)) { + if (!netif_xmit_stopped(txq)) + status = netpoll_start_xmit(skb, dev, txq); + + HARD_TX_UNLOCK(dev, txq); + + if (dev_xmit_complete(status)) + break; + + } + + /* tickle device maybe there is some cleanup */ + netpoll_poll_dev(np->dev); + + udelay(USEC_PER_POLL); + } + + WARN_ONCE(!irqs_disabled(), + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", + dev->name, dev->netdev_ops->ndo_start_xmit); + + } + + if (!dev_xmit_complete(status)) { + skb_queue_tail(&npinfo->txq, skb); + schedule_delayed_work(&npinfo->tx_work,0); + } + return NETDEV_TX_OK; +} + +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + netdev_tx_t ret; + + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } + return ret; +} +EXPORT_SYMBOL(netpoll_send_skb); + +void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ + int total_len, ip_len, udp_len; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + static atomic_t ip_ident; + struct ipv6hdr *ip6h; + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!irqs_disabled()); + + udp_len = len + sizeof(*udph); + if (np->ipv6) + ip_len = udp_len + sizeof(*ip6h); + else + ip_len = udp_len + sizeof(*iph); + + total_len = ip_len + LL_RESERVED_SPACE(np->dev); + + skb = find_skb(np, total_len + np->dev->needed_tailroom, + total_len - len); + if (!skb) + return; + + skb_copy_to_linear_data(skb, msg, len); + skb_put(skb, len); + + skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + + if (np->ipv6) { + udph->check = 0; + udph->check = csum_ipv6_magic(&np->local_ip.in6, + &np->remote_ip.in6, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + + /* ip6h->version = 6; ip6h->priority = 0; */ + *(unsigned char *)ip6h = 0x60; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; + + eth = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IPV6); + } else { + udph->check = 0; + udph->check = csum_tcpudp_magic(np->local_ip.ip, + np->remote_ip.ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + *(unsigned char *)iph = 0x45; + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &(iph->saddr)); + put_unaligned(np->remote_ip.ip, &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IP); + } + + ether_addr_copy(eth->h_source, np->dev->dev_addr); + ether_addr_copy(eth->h_dest, np->remote_mac); + + skb->dev = np->dev; + + netpoll_send_skb(np, skb); +} +EXPORT_SYMBOL(netpoll_send_udp); + +void netpoll_print_options(struct netpoll *np) +{ + np_info(np, "local port %d\n", np->local_port); + if (np->ipv6) + np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); + else + np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); + np_info(np, "interface '%s'\n", np->dev_name); + np_info(np, "remote port %d\n", np->remote_port); + if (np->ipv6) + np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); + else + np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); + np_info(np, "remote ethernet address %pM\n", np->remote_mac); +} +EXPORT_SYMBOL(netpoll_print_options); + +static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) +{ + const char *end; + + if (!strchr(str, ':') && + in4_pton(str, -1, (void *)addr, -1, &end) > 0) { + if (!*end) + return 0; + } + if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { +#if IS_ENABLED(CONFIG_IPV6) + if (!*end) + return 1; +#else + return -1; +#endif + } + return -1; +} + +int netpoll_parse_options(struct netpoll *np, char *opt) +{ + char *cur=opt, *delim; + int ipv6; + bool ipversion_set = false; + + if (*cur != '@') { + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim = 0; + if (kstrtou16(cur, 10, &np->local_port)) + goto parse_failed; + cur = delim; + } + cur++; + + if (*cur != '/') { + ipversion_set = true; + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim = 0; + ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); + if (ipv6 < 0) + goto parse_failed; + else + np->ipv6 = (bool)ipv6; + cur = delim; + } + cur++; + + if (*cur != ',') { + /* parse out dev name */ + if ((delim = strchr(cur, ',')) == NULL) + goto parse_failed; + *delim = 0; + strscpy(np->dev_name, cur, sizeof(np->dev_name)); + cur = delim; + } + cur++; + + if (*cur != '@') { + /* dst port */ + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim = 0; + if (*cur == ' ' || *cur == '\t') + np_info(np, "warning: whitespace is not allowed\n"); + if (kstrtou16(cur, 10, &np->remote_port)) + goto parse_failed; + cur = delim; + } + cur++; + + /* dst ip */ + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim = 0; + ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); + if (ipv6 < 0) + goto parse_failed; + else if (ipversion_set && np->ipv6 != (bool)ipv6) + goto parse_failed; + else + np->ipv6 = (bool)ipv6; + cur = delim + 1; + + if (*cur != 0) { + /* MAC address */ + if (!mac_pton(cur, np->remote_mac)) + goto parse_failed; + } + + netpoll_print_options(np); + + return 0; + + parse_failed: + np_info(np, "couldn't parse config at '%s'!\n", cur); + return -1; +} +EXPORT_SYMBOL(netpoll_parse_options); + +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) +{ + struct netpoll_info *npinfo; + const struct net_device_ops *ops; + int err; + + np->dev = ndev; + strscpy(np->dev_name, ndev->name, IFNAMSIZ); + + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { + np_err(np, "%s doesn't support polling, aborting\n", + np->dev_name); + err = -ENOTSUPP; + goto out; + } + + if (!ndev->npinfo) { + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + if (!npinfo) { + err = -ENOMEM; + goto out; + } + + sema_init(&npinfo->dev_lock, 1); + skb_queue_head_init(&npinfo->txq); + INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); + + refcount_set(&npinfo->refcnt, 1); + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_setup) { + err = ops->ndo_netpoll_setup(ndev, npinfo); + if (err) + goto free_npinfo; + } + } else { + npinfo = rtnl_dereference(ndev->npinfo); + refcount_inc(&npinfo->refcnt); + } + + npinfo->netpoll = np; + + /* last thing to do is link it to the net device structure */ + rcu_assign_pointer(ndev->npinfo, npinfo); + + return 0; + +free_npinfo: + kfree(npinfo); +out: + return err; +} +EXPORT_SYMBOL_GPL(__netpoll_setup); + +int netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = NULL; + struct in_device *in_dev; + int err; + + rtnl_lock(); + if (np->dev_name[0]) { + struct net *net = current->nsproxy->net_ns; + ndev = __dev_get_by_name(net, np->dev_name); + } + if (!ndev) { + np_err(np, "%s doesn't exist, aborting\n", np->dev_name); + err = -ENODEV; + goto unlock; + } + dev_hold(ndev); + + if (netdev_master_upper_dev_get(ndev)) { + np_err(np, "%s is a slave device, aborting\n", np->dev_name); + err = -EBUSY; + goto put; + } + + if (!netif_running(ndev)) { + unsigned long atmost, atleast; + + np_info(np, "device %s not up yet, forcing it\n", np->dev_name); + + err = dev_open(ndev, NULL); + + if (err) { + np_err(np, "failed to open %s\n", ndev->name); + goto put; + } + + rtnl_unlock(); + atleast = jiffies + HZ/10; + atmost = jiffies + carrier_timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + np_notice(np, "timeout waiting for carrier\n"); + break; + } + msleep(1); + } + + /* If carrier appears to come up instantly, we don't + * trust it and pause so that we don't pump all our + * queued console messages into the bitbucket. + */ + + if (time_before(jiffies, atleast)) { + np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); + msleep(4000); + } + rtnl_lock(); + } + + if (!np->local_ip.ip) { + if (!np->ipv6) { + const struct in_ifaddr *ifa; + + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) + goto put_noaddr; + + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { +put_noaddr: + np_err(np, "no IP address for %s, aborting\n", + np->dev_name); + err = -EDESTADDRREQ; + goto put; + } + + np->local_ip.ip = ifa->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + } else { +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *idev; + + err = -EDESTADDRREQ; + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) + continue; + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + np->dev_name); + goto put; + } else + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); +#else + np_err(np, "IPv6 is not supported %s, aborting\n", + np->dev_name); + err = -EINVAL; + goto put; +#endif + } + } + + /* fill up the skb queue */ + refill_skbs(); + + err = __netpoll_setup(np, ndev); + if (err) + goto put; + netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL); + rtnl_unlock(); + return 0; + +put: + dev_put(ndev); +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(netpoll_setup); + +static int __init netpoll_init(void) +{ + skb_queue_head_init(&skb_pool); + return 0; +} +core_initcall(netpoll_init); + +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ + struct netpoll_info *npinfo = + container_of(rcu_head, struct netpoll_info, rcu); + + skb_queue_purge(&npinfo->txq); + + /* we can't call cancel_delayed_work_sync here, as we are in softirq */ + cancel_delayed_work(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + /* now cancel it again */ + cancel_delayed_work(&npinfo->tx_work); + kfree(npinfo); +} + +void __netpoll_cleanup(struct netpoll *np) +{ + struct netpoll_info *npinfo; + + npinfo = rtnl_dereference(np->dev->npinfo); + if (!npinfo) + return; + + synchronize_srcu(&netpoll_srcu); + + if (refcount_dec_and_test(&npinfo->refcnt)) { + const struct net_device_ops *ops; + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_cleanup) + ops->ndo_netpoll_cleanup(np->dev); + + RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); + } else + RCU_INIT_POINTER(np->dev->npinfo, NULL); +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); + +void __netpoll_free(struct netpoll *np) +{ + ASSERT_RTNL(); + + /* Wait for transmitting packets to finish before freeing. */ + synchronize_rcu(); + __netpoll_cleanup(np); + kfree(np); +} +EXPORT_SYMBOL_GPL(__netpoll_free); + +void netpoll_cleanup(struct netpoll *np) +{ + rtnl_lock(); + if (!np->dev) + goto out; + __netpoll_cleanup(np); + netdev_put(np->dev, &np->dev_tracker); + np->dev = NULL; +out: + rtnl_unlock(); +} +EXPORT_SYMBOL(netpoll_cleanup); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 000000000..8456dfbe2 --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/core/netprio_cgroup.c Priority Control Group + * + * Authors: Neil Horman <nhorman@tuxdriver.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <linux/sched/task.h> + +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +#include <linux/fdtable.h> + +/* + * netprio allocates per-net_device priomap array which is indexed by + * css->id. Limiting css ID to 16bits doesn't lose anything. + */ +#define NETPRIO_ID_MAX USHRT_MAX + +#define PRIOMAP_MIN_SZ 128 + +/* + * Extend @dev->priomap so that it's large enough to accommodate + * @target_idx. @dev->priomap.priomap_len > @target_idx after successful + * return. Must be called under rtnl lock. + */ +static int extend_netdev_table(struct net_device *dev, u32 target_idx) +{ + struct netprio_map *old, *new; + size_t new_sz, new_len; + + /* is the existing priomap large enough? */ + old = rtnl_dereference(dev->priomap); + if (old && old->priomap_len > target_idx) + return 0; + + /* + * Determine the new size. Let's keep it power-of-two. We start + * from PRIOMAP_MIN_SZ and double it until it's large enough to + * accommodate @target_idx. + */ + new_sz = PRIOMAP_MIN_SZ; + while (true) { + new_len = (new_sz - offsetof(struct netprio_map, priomap)) / + sizeof(new->priomap[0]); + if (new_len > target_idx) + break; + new_sz *= 2; + /* overflowed? */ + if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) + return -ENOSPC; + } + + /* allocate & copy */ + new = kzalloc(new_sz, GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (old) + memcpy(new->priomap, old->priomap, + old->priomap_len * sizeof(old->priomap[0])); + + new->priomap_len = new_len; + + /* install the new priomap */ + rcu_assign_pointer(dev->priomap, new); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +/** + * netprio_prio - return the effective netprio of a cgroup-net_device pair + * @css: css part of the target pair + * @dev: net_device part of the target pair + * + * Should be called under RCU read or rtnl lock. + */ +static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) +{ + struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + int id = css->id; + + if (map && id < map->priomap_len) + return map->priomap[id]; + return 0; +} + +/** + * netprio_set_prio - set netprio on a cgroup-net_device pair + * @css: css part of the target pair + * @dev: net_device part of the target pair + * @prio: prio to set + * + * Set netprio to @prio on @css-@dev pair. Should be called under rtnl + * lock and may fail under memory pressure for non-zero @prio. + */ +static int netprio_set_prio(struct cgroup_subsys_state *css, + struct net_device *dev, u32 prio) +{ + struct netprio_map *map; + int id = css->id; + int ret; + + /* avoid extending priomap for zero writes */ + map = rtnl_dereference(dev->priomap); + if (!prio && (!map || map->priomap_len <= id)) + return 0; + + ret = extend_netdev_table(dev, id); + if (ret) + return ret; + + map = rtnl_dereference(dev->priomap); + map->priomap[id] = prio; + return 0; +} + +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css; + + css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static int cgrp_css_online(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *parent_css = css->parent; + struct net_device *dev; + int ret = 0; + + if (css->id > NETPRIO_ID_MAX) + return -ENOSPC; + + if (!parent_css) + return 0; + + rtnl_lock(); + /* + * Inherit prios from the parent. As all prios are set during + * onlining, there is no need to clear them on offline. + */ + for_each_netdev(&init_net, dev) { + u32 prio = netprio_prio(parent_css, dev); + + ret = netprio_set_prio(css, dev, prio); + if (ret) + break; + } + rtnl_unlock(); + return ret; +} + +static void cgrp_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return css->id; +} + +static int read_priomap(struct seq_file *sf, void *v) +{ + struct net_device *dev; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) + seq_printf(sf, "%s %u\n", dev->name, + netprio_prio(seq_css(sf), dev)); + rcu_read_unlock(); + return 0; +} + +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char devname[IFNAMSIZ + 1]; + struct net_device *dev; + u32 prio; + int ret; + + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + return -EINVAL; + + dev = dev_get_by_name(&init_net, devname); + if (!dev) + return -ENODEV; + + rtnl_lock(); + + ret = netprio_set_prio(of_css(of), dev, prio); + + rtnl_unlock(); + dev_put(dev); + return ret ?: nbytes; +} + +static int update_netprio(const void *v, struct file *file, unsigned n) +{ + struct socket *sock = sock_from_file(file); + + if (sock) + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); + return 0; +} + +static void net_prio_attach(struct cgroup_taskset *tset) +{ + struct task_struct *p; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->id; + + task_lock(p); + iterate_fd(p->files, 0, update_netprio, v); + task_unlock(p); + } +} + +static struct cftype ss_files[] = { + { + .name = "prioidx", + .read_u64 = read_prioidx, + }, + { + .name = "ifpriomap", + .seq_show = read_priomap, + .write = write_priomap, + }, + { } /* terminate */ +}; + +struct cgroup_subsys net_prio_cgrp_subsys = { + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, + .attach = net_prio_attach, + .legacy_cftypes = ss_files, +}; + +static int netprio_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netprio_map *old; + + /* + * Note this is called with rtnl_lock held so we have update side + * protection on our rcu assignments + */ + + switch (event) { + case NETDEV_UNREGISTER: + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block netprio_device_notifier = { + .notifier_call = netprio_device_event +}; + +static int __init init_cgroup_netprio(void) +{ + register_netdevice_notifier(&netprio_device_notifier); + return 0; +} +subsys_initcall(init_cgroup_netprio); diff --git a/net/core/of_net.c b/net/core/of_net.c new file mode 100644 index 000000000..f1a9bf757 --- /dev/null +++ b/net/core/of_net.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * OF helpers for network devices. + * + * Initially copied out of arch/powerpc/kernel/prom_parse.c + */ +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/of_net.h> +#include <linux/of_platform.h> +#include <linux/phy.h> +#include <linux/export.h> +#include <linux/device.h> +#include <linux/nvmem-consumer.h> + +/** + * of_get_phy_mode - Get phy mode for given device_node + * @np: Pointer to the given device_node + * @interface: Pointer to the result + * + * The function gets phy interface string from property 'phy-mode' or + * 'phy-connection-type'. The index in phy_modes table is set in + * interface and 0 returned. In case of error interface is set to + * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV. + */ +int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) +{ + const char *pm; + int err, i; + + *interface = PHY_INTERFACE_MODE_NA; + + err = of_property_read_string(np, "phy-mode", &pm); + if (err < 0) + err = of_property_read_string(np, "phy-connection-type", &pm); + if (err < 0) + return err; + + for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) + if (!strcasecmp(pm, phy_modes(i))) { + *interface = i; + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(of_get_phy_mode); + +static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) +{ + struct property *pp = of_find_property(np, name, NULL); + + if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { + memcpy(addr, pp->value, ETH_ALEN); + return 0; + } + return -ENODEV; +} + +static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) +{ + struct platform_device *pdev = of_find_device_by_node(np); + struct nvmem_cell *cell; + const void *mac; + size_t len; + int ret; + + /* Try lookup by device first, there might be a nvmem_cell_lookup + * associated with a given device. + */ + if (pdev) { + ret = nvmem_get_mac_address(&pdev->dev, addr); + put_device(&pdev->dev); + return ret; + } + + cell = of_nvmem_cell_get(np, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + memcpy(addr, mac, ETH_ALEN); + kfree(mac); + + return 0; +} + +/** + * of_get_mac_address() + * @np: Caller's Device Node + * @addr: Pointer to a six-byte array for the result + * + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. If any + * of the above isn't set, then try to get MAC address from nvmem cell named + * 'mac-address'. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. + * + * Return: 0 on success and errno in case of error. +*/ +int of_get_mac_address(struct device_node *np, u8 *addr) +{ + int ret; + + if (!np) + return -ENODEV; + + ret = of_get_mac_addr(np, "mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "local-mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "address", addr); + if (!ret) + return 0; + + return of_get_mac_addr_nvmem(np, addr); +} +EXPORT_SYMBOL(of_get_mac_address); + +/** + * of_get_ethdev_address() + * @np: Caller's Device Node + * @dev: Pointer to netdevice which address will be updated + * + * Search the device tree for the best MAC address to use. + * If found set @dev->dev_addr to that address. + * + * See documentation of of_get_mac_address() for more information on how + * the best address is determined. + * + * Return: 0 on success and errno in case of error. + */ +int of_get_ethdev_address(struct device_node *np, struct net_device *dev) +{ + u8 addr[ETH_ALEN]; + int ret; + + ret = of_get_mac_address(np, addr); + if (!ret) + eth_hw_addr_set(dev, addr); + return ret; +} +EXPORT_SYMBOL(of_get_ethdev_address); diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000..caf6d950d --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,932 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/device.h> + +#include <net/page_pool.h> +#include <net/xdp.h> + +#include <linux/dma-direction.h> +#include <linux/dma-mapping.h> +#include <linux/page-flags.h> +#include <linux/mm.h> /* for put_page() */ +#include <linux/poison.h> +#include <linux/ethtool.h> + +#include <trace/events/page_pool.h> + +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (60 * HZ) + +#define BIAS_MAX LONG_MAX + +#ifdef CONFIG_PAGE_POOL_STATS +/* alloc_stat_inc is intended to be used in softirq context */ +#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) +/* recycle_stat_inc is safe to use when preemption is possible. */ +#define recycle_stat_inc(pool, __stat) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_inc(s->__stat); \ + } while (0) + +#define recycle_stat_add(pool, __stat, val) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_add(s->__stat, val); \ + } while (0) + +static const char pp_stats[][ETH_GSTRING_LEN] = { + "rx_pp_alloc_fast", + "rx_pp_alloc_slow", + "rx_pp_alloc_slow_ho", + "rx_pp_alloc_empty", + "rx_pp_alloc_refill", + "rx_pp_alloc_waive", + "rx_pp_recycle_cached", + "rx_pp_recycle_cache_full", + "rx_pp_recycle_ring", + "rx_pp_recycle_ring_full", + "rx_pp_recycle_released_ref", +}; + +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats) +{ + int cpu = 0; + + if (!stats) + return false; + + /* The caller is responsible to initialize stats. */ + stats->alloc_stats.fast += pool->alloc_stats.fast; + stats->alloc_stats.slow += pool->alloc_stats.slow; + stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; + stats->alloc_stats.empty += pool->alloc_stats.empty; + stats->alloc_stats.refill += pool->alloc_stats.refill; + stats->alloc_stats.waive += pool->alloc_stats.waive; + + for_each_possible_cpu(cpu) { + const struct page_pool_recycle_stats *pcpu = + per_cpu_ptr(pool->recycle_stats, cpu); + + stats->recycle_stats.cached += pcpu->cached; + stats->recycle_stats.cache_full += pcpu->cache_full; + stats->recycle_stats.ring += pcpu->ring; + stats->recycle_stats.ring_full += pcpu->ring_full; + stats->recycle_stats.released_refcnt += pcpu->released_refcnt; + } + + return true; +} +EXPORT_SYMBOL(page_pool_get_stats); + +u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { + memcpy(data, pp_stats[i], ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + + return data; +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); + +int page_pool_ethtool_stats_get_count(void) +{ + return ARRAY_SIZE(pp_stats); +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); + +u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +{ + struct page_pool_stats *pool_stats = stats; + + *data++ = pool_stats->alloc_stats.fast; + *data++ = pool_stats->alloc_stats.slow; + *data++ = pool_stats->alloc_stats.slow_high_order; + *data++ = pool_stats->alloc_stats.empty; + *data++ = pool_stats->alloc_stats.refill; + *data++ = pool_stats->alloc_stats.waive; + *data++ = pool_stats->recycle_stats.cached; + *data++ = pool_stats->recycle_stats.cache_full; + *data++ = pool_stats->recycle_stats.ring; + *data++ = pool_stats->recycle_stats.ring_full; + *data++ = pool_stats->recycle_stats.released_refcnt; + + return data; +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get); + +#else +#define alloc_stat_inc(pool, __stat) +#define recycle_stat_inc(pool, __stat) +#define recycle_stat_add(pool, __stat, val) +#endif + +static bool page_pool_producer_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + bool in_softirq = in_softirq(); + + if (in_softirq) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); + + return in_softirq; +} + +static void page_pool_producer_unlock(struct page_pool *pool, + bool in_softirq) + __releases(&pool->ring.producer_lock) +{ + if (in_softirq) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. + */ + if (pool->p.flags & PP_FLAG_DMA_MAP) { + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + } + + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + pool->p.flags & PP_FLAG_PAGE_FRAG) + return -EINVAL; + +#ifdef CONFIG_PAGE_POOL_STATS + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; +#endif + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif + return -ENOMEM; + } + + atomic_set(&pool->pages_state_release_cnt, 0); + + /* Driver calling page_pool_create() also call page_pool_destroy() */ + refcount_set(&pool->user_cnt, 1); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + get_device(pool->p.dev); + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +static void page_pool_return_page(struct page_pool *pool, struct page *page); + +noinline +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + int pref_nid; /* preferred NUMA node */ + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) { + alloc_stat_inc(pool, empty); + return NULL; + } + + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. + */ +#ifdef CONFIG_NUMA + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif + + /* Refill alloc array, but only if NUMA match */ + do { + page = __ptr_ring_consume(r); + if (unlikely(!page)) + break; + + if (likely(page_to_nid(page) == pref_nid)) { + pool->alloc.cache[pool->alloc.count++] = page; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. + */ + page_pool_return_page(pool, page); + alloc_stat_inc(pool, waive); + page = NULL; + break; + } + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) { + page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, refill); + } + + return page; +} + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct page *page; + + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, fast); + } else { + page = page_pool_refill_alloc_cache(pool); + } + + return page; +} + +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_addr_t dma_addr = page_pool_get_dma_addr(page); + + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} + +static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +{ + dma_addr_t dma; + + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr + * since dma_addr_t can be either 32 or 64 bits and does not always fit + * into page private data (i.e 32bit cpu with 64bit DMA caps) + * This mapping is kept for lifetime of page, until leaving pool. + */ + dma = dma_map_page_attrs(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); + if (dma_mapping_error(pool->p.dev, dma)) + return false; + + page_pool_set_dma_addr(page, dma); + + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + + return true; +} + +static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; + if (pool->p.init_callback) + pool->p.init_callback(page, pool->p.init_arg); +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + +static struct page *__page_pool_alloc_page_order(struct page_pool *pool, + gfp_t gfp) +{ + struct page *page; + + gfp |= __GFP_COMP; + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (unlikely(!page)) + return NULL; + + if ((pool->p.flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { + put_page(page); + return NULL; + } + + alloc_stat_inc(pool, slow_high_order); + page_pool_set_pp_info(pool, page); + + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t gfp) +{ + const int bulk = PP_ALLOC_CACHE_REFILL; + unsigned int pp_flags = pool->p.flags; + unsigned int pp_order = pool->p.order; + struct page *page; + int i, nr_pages; + + /* Don't support bulk alloc for high-order pages */ + if (unlikely(pp_order)) + return __page_pool_alloc_page_order(pool, gfp); + + /* Unnecessary as alloc cache is empty, but guarantees zero count */ + if (unlikely(pool->alloc.count > 0)) + return pool->alloc.cache[--pool->alloc.count]; + + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); + + nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, + pool->alloc.cache); + if (unlikely(!nr_pages)) + return NULL; + + /* Pages have been filled into alloc.cache array, but count is zero and + * page element have not been (possibly) DMA mapped. + */ + for (i = 0; i < nr_pages; i++) { + page = pool->alloc.cache[i]; + if ((pp_flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { + put_page(page); + continue; + } + + page_pool_set_pp_info(pool, page); + pool->alloc.cache[pool->alloc.count++] = page; + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, page, + pool->pages_state_hold_cnt); + } + + /* Return last page */ + if (likely(pool->alloc.count > 0)) { + page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, slow); + } else { + page = NULL; + } + + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Calculate distance between two u32 values, valid if distance is below 2^(31) + * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution + */ +#define _distance(a, b) (s32)((a) - (b)) + +static s32 page_pool_inflight(struct page_pool *pool) +{ + u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); + u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); + s32 inflight; + + inflight = _distance(hold_cnt, release_cnt); + + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + + return inflight; +} + +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_release_page(struct page_pool *pool, struct page *page) +{ + dma_addr_t dma; + int count; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + /* Always account for inflight pages, even if we didn't + * map them + */ + goto skip_dma_unmap; + + dma = page_pool_get_dma_addr(page); + + /* When page is unmapped, it cannot be returned to our pool */ + dma_unmap_page_attrs(pool->p.dev, dma, + PAGE_SIZE << pool->p.order, pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC); + page_pool_set_dma_addr(page, 0); +skip_dma_unmap: + page_pool_clear_pp_info(page); + + /* This may be the last page returned, releasing the pool, so + * it is not safe to reference pool afterwards. + */ + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, count); +} +EXPORT_SYMBOL(page_pool_release_page); + +/* Return a page to the page allocator, cleaning up our state */ +static void page_pool_return_page(struct page_pool *pool, struct page *page) +{ + page_pool_release_page(pool, page); + + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) +{ + int ret; + /* BH protection not needed if current is softirq */ + if (in_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + if (!ret) { + recycle_stat_inc(pool, ring); + return true; + } + + return false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool page_pool_recycle_in_cache(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { + recycle_stat_inc(pool, cache_full); + return false; + } + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + recycle_stat_inc(pool, cached); + return true; +} + +/* If the page refcnt == 1, this will try to recycle the page. + * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * the configured size min(dma_sync_size, pool->max_len). + * If the page refcnt != 1, then the page will be returned to memory + * subsystem. + */ +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) + */ + if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); + + if (allow_direct && in_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; + + /* Page found as candidate for recycling */ + return page; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. + */ + recycle_stat_inc(pool, released_refcnt); + /* Do not replace this with page_pool_return_page() */ + page_pool_release_page(pool, page); + put_page(page); + + return NULL; +} + +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + recycle_stat_inc(pool, ring_full); + page_pool_return_page(pool, page); + } +} +EXPORT_SYMBOL(page_pool_put_defragged_page); + +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + bool in_softirq; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + /* It is not the last user for the page frag case */ + if (!page_pool_is_last_frag(pool, page)) + continue; + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + in_softirq = page_pool_producer_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) { + /* ring full */ + recycle_stat_inc(pool, ring_full); + break; + } + } + recycle_stat_add(pool, ring, i); + page_pool_producer_unlock(pool, in_softirq); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + +static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_defrag_page(page, drain_count))) + return NULL; + + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || page_pool_defrag_page(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) { + alloc_stat_inc(pool, fast); + goto frag_reset; + } + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_fragment_page(page, BIAS_MAX); + return page; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + alloc_stat_inc(pool, fast); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + +static void page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume_bh(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + page_pool_return_page(pool, page); + } +} + +static void page_pool_free(struct page_pool *pool) +{ + if (pool->disconnect) + pool->disconnect(pool); + + ptr_ring_cleanup(&pool->ring, NULL); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + put_device(pool->p.dev); + +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif + kfree(pool); +} + +static void page_pool_empty_alloc_cache_once(struct page_pool *pool) +{ + struct page *page; + + if (pool->destroy_cnt) + return; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, page); + } +} + +static void page_pool_scrub(struct page_pool *pool) +{ + page_pool_empty_alloc_cache_once(pool); + pool->destroy_cnt++; + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + page_pool_empty_ring(pool); +} + +static int page_pool_release(struct page_pool *pool) +{ + int inflight; + + page_pool_scrub(pool); + inflight = page_pool_inflight(pool); + if (!inflight) + page_pool_free(pool); + + return inflight; +} + +static void page_pool_release_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + int inflight; + + inflight = page_pool_release(pool); + if (!inflight) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, pool->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; + + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", + __func__, inflight, sec); + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} + +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem) +{ + refcount_inc(&pool->user_cnt); + pool->disconnect = disconnect; + pool->xdp_mem_id = mem->id; +} + +void page_pool_destroy(struct page_pool *pool) +{ + if (!pool) + return; + + if (!page_pool_put(pool)) + return; + + page_pool_free_frag(pool); + + if (!page_pool_release(pool)) + return; + + pool->defer_start = jiffies; + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} +EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + struct page *page; + + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; + + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, page); + } +} +EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/pktgen.c b/net/core/pktgen.c new file mode 100644 index 000000000..471d4effa --- /dev/null +++ b/net/core/pktgen.c @@ -0,0 +1,4039 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Authors: + * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se> + * Uppsala University and + * Swedish University of Agricultural Sciences + * + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Ben Greear <greearb@candelatech.com> + * Jens Låås <jens.laas@data.slu.se> + * + * A tool for loading the network with preconfigurated packets. + * The tool is implemented as a linux module. Parameters are output + * device, delay (to hard_xmit), number of packets, and whether + * to use multiple SKBs or just the same one. + * pktgen uses the installed interface's output routine. + * + * Additional hacking by: + * + * Jens.Laas@data.slu.se + * Improved by ANK. 010120. + * Improved by ANK even more. 010212. + * MAC address typo fixed. 010417 --ro + * Integrated. 020301 --DaveM + * Added multiskb option 020301 --DaveM + * Scaling of results. 020417--sigurdur@linpro.no + * Significant re-work of the module: + * * Convert to threaded model to more efficiently be able to transmit + * and receive on multiple interfaces at once. + * * Converted many counters to __u64 to allow longer runs. + * * Allow configuration of ranges, like min/max IP address, MACs, + * and UDP-ports, for both source and destination, and can + * set to use a random distribution or sequentially walk the range. + * * Can now change most values after starting. + * * Place 12-byte packet in UDP payload with magic number, + * sequence number, and timestamp. + * * Add receiver code that detects dropped pkts, re-ordered pkts, and + * latencies (with micro-second) precision. + * * Add IOCTL interface to easily get counters & configuration. + * --Ben Greear <greearb@candelatech.com> + * + * Renamed multiskb to clone_skb and cleaned up sending core for two distinct + * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0 + * as a "fastpath" with a configurable number of clones after alloc's. + * clone_skb=0 means all packets are allocated this also means ranges time + * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100 + * clones. + * + * Also moved to /proc/net/pktgen/ + * --ro + * + * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever + * mistakes. Also merged in DaveM's patch in the -pre6 patch. + * --Ben Greear <greearb@candelatech.com> + * + * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) + * + * 021124 Finished major redesign and rewrite for new functionality. + * See Documentation/networking/pktgen.rst for how to use this. + * + * The new operation: + * For each CPU one thread/process is created at start. This process checks + * for running devices in the if_list and sends packets until count is 0 it + * also the thread checks the thread->control which is used for inter-process + * communication. controlling process "posts" operations to the threads this + * way. + * The if_list is RCU protected, and the if_lock remains to protect updating + * of if_list, from "add_device" as it invoked from userspace (via proc write). + * + * By design there should only be *one* "controlling" process. In practice + * multiple write accesses gives unpredictable result. Understood by "write" + * to /proc gives result code thats should be read be the "writer". + * For practical use this should be no problem. + * + * Note when adding devices to a specific CPU there good idea to also assign + * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU. + * --ro + * + * Fix refcount off by one if first packet fails, potential null deref, + * memleak 030710- KJP + * + * First "ranges" functionality for ipv6 030726 --ro + * + * Included flow support. 030802 ANK. + * + * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org> + * + * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419 + * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604 + * + * New xmit() return, do_div and misc clean up by Stephen Hemminger + * <shemminger@osdl.org> 040923 + * + * Randy Dunlap fixed u64 printk compiler warning + * + * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org> + * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213 + * + * Corrections from Nikolai Malykh (nmalykh@bilim.com) + * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230 + * + * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com> + * 050103 + * + * MPLS support by Steven Whitehouse <steve@chygwyn.com> + * + * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com> + * + * Fixed src_mac command to set source mac of packet to value specified in + * command by Adit Ranadive <adit.262@gmail.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sys.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/unistd.h> +#include <linux/string.h> +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/hrtimer.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/inet.h> +#include <linux/inetdevice.h> +#include <linux/rtnetlink.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/wait.h> +#include <linux/etherdevice.h> +#include <linux/kthread.h> +#include <linux/prefetch.h> +#include <linux/mmzone.h> +#include <net/net_namespace.h> +#include <net/checksum.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> +#include <net/addrconf.h> +#ifdef CONFIG_XFRM +#include <net/xfrm.h> +#endif +#include <net/netns/generic.h> +#include <asm/byteorder.h> +#include <linux/rcupdate.h> +#include <linux/bitops.h> +#include <linux/io.h> +#include <linux/timex.h> +#include <linux/uaccess.h> +#include <asm/dma.h> +#include <asm/div64.h> /* do_div */ + +#define VERSION "2.75" +#define IP_NAME_SZ 32 +#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ +#define MPLS_STACK_BOTTOM htonl(0x00000100) +/* Max number of internet mix entries that can be specified in imix_weights. */ +#define MAX_IMIX_ENTRIES 20 +#define IMIX_PRECISION 100 /* Precision of IMIX distribution */ + +#define func_enter() pr_debug("entering %s\n", __func__); + +#define PKT_FLAGS \ + pf(IPV6) /* Interface in IPV6 Mode */ \ + pf(IPSRC_RND) /* IP-Src Random */ \ + pf(IPDST_RND) /* IP-Dst Random */ \ + pf(TXSIZE_RND) /* Transmit size is random */ \ + pf(UDPSRC_RND) /* UDP-Src Random */ \ + pf(UDPDST_RND) /* UDP-Dst Random */ \ + pf(UDPCSUM) /* Include UDP checksum */ \ + pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \ + pf(MPLS_RND) /* Random MPLS labels */ \ + pf(QUEUE_MAP_RND) /* queue map Random */ \ + pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \ + pf(FLOW_SEQ) /* Sequential flows */ \ + pf(IPSEC) /* ipsec on for flows */ \ + pf(MACSRC_RND) /* MAC-Src Random */ \ + pf(MACDST_RND) /* MAC-Dst Random */ \ + pf(VID_RND) /* Random VLAN ID */ \ + pf(SVID_RND) /* Random SVLAN ID */ \ + pf(NODE) /* Node memory alloc*/ \ + +#define pf(flag) flag##_SHIFT, +enum pkt_flags { + PKT_FLAGS +}; +#undef pf + +/* Device flag bits */ +#define pf(flag) static const __u32 F_##flag = (1<<flag##_SHIFT); +PKT_FLAGS +#undef pf + +#define pf(flag) __stringify(flag), +static char *pkt_flag_names[] = { + PKT_FLAGS +}; +#undef pf + +#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names) + +/* Thread control flag bits */ +#define T_STOP (1<<0) /* Stop run */ +#define T_RUN (1<<1) /* Start run */ +#define T_REMDEVALL (1<<2) /* Remove all devs */ +#define T_REMDEV (1<<3) /* Remove one dev */ + +/* Xmit modes */ +#define M_START_XMIT 0 /* Default normal TX */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ + +/* If lock -- protects updating of if_list */ +#define if_lock(t) mutex_lock(&(t->if_lock)); +#define if_unlock(t) mutex_unlock(&(t->if_lock)); + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" +#define PGCTRL "pgctrl" + +#define MAX_CFLOWS 65536 + +#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) +#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) + +struct imix_pkt { + u64 size; + u64 weight; + u64 count_so_far; +}; + +struct flow_state { + __be32 cur_daddr; + int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif + __u32 flags; +}; + +/* flow flag bits */ +#define F_INIT (1<<0) /* flow has been initialized */ + +struct pktgen_dev { + /* + * Try to keep frequent/infrequent used vars. separated. + */ + struct proc_dir_entry *entry; /* proc file */ + struct pktgen_thread *pg_thread;/* the owner */ + struct list_head list; /* chaining in the thread's run-queue */ + struct rcu_head rcu; /* freed by RCU */ + + int running; /* if false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. + */ + __u32 flags; + int xmit_mode; + int min_pkt_size; + int max_pkt_size; + int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ + int nfrags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + + struct page *page; + u64 delay; /* nano-seconds */ + + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, */ + + /* runtime counters relating to clone_skb */ + + __u32 clone_count; + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? + * This will keep sequence numbers in order + */ + ktime_t next_tx; + ktime_t started_at; + ktime_t stopped_at; + u64 idle_acc; /* nano-seconds */ + + __u32 seq_num; + + int clone_skb; /* + * Use multiple SKBs during packet gen. + * If this number is greater than 1, then + * that many copies of the same packet will be + * sent before a new packet is allocated. + * If you want to send 1024 identical packets + * before creating a new packet, + * set clone_skb to 1024. + */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + __be32 saddr_min; /* inclusive, source IP address */ + __be32 saddr_max; /* exclusive, source IP address */ + __be32 daddr_min; /* inclusive, dest IP address */ + __be32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + /* DSCP + ECN */ + __u8 tos; /* six MSB of (former) IPv4 TOS + are for dscp codepoint */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + (see RFC 3260, sec. 4) */ + + /* IMIX */ + unsigned int n_imix_entries; + struct imix_pkt imix_entries[MAX_IMIX_ENTRIES]; + /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/ + __u8 imix_distribution[IMIX_PRECISION]; + + /* MPLS */ + unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */ + __be32 labels[MAX_MPLS_LABELS]; + + /* VLAN/SVLAN (802.1Q/Q-in-Q) */ + __u8 vlan_p; + __u8 vlan_cfi; + __u16 vlan_id; /* 0xffff means no vlan tag */ + + __u8 svlan_p; + __u8 svlan_cfi; + __u16 svlan_id; /* 0xffff means no svlan tag */ + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + unsigned char dst_mac[ETH_ALEN]; + unsigned char src_mac[ETH_ALEN]; + + __u32 cur_dst_mac_offset; + __u32 cur_src_mac_offset; + __be32 cur_saddr; + __be32 cur_daddr; + __u16 ip_id; + __u16 cur_udp_dst; + __u16 cur_udp_src; + __u16 cur_queue_map; + __u32 cur_pkt_size; + __u32 last_pkt_size; + + __u8 hh[14]; + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + + We fill in SRC address later + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00 + }; + */ + __u16 pad; /* pad out the hh struct to an even 16 bytes */ + + struct sk_buff *skb; /* skb we are to transmit next, used for when we + * are transmitting the same one multiple times + */ + struct net_device *odev; /* The out-going device. + * Note that the device should have it's + * pg_info pointer pointing back to this + * device. + * Set when the user specifies the out-going + * device name (not when the inject is + * started as it used to do.) + */ + netdevice_tracker dev_tracker; + char odevname[32]; + struct flow_state *flows; + unsigned int cflows; /* Concurrent flows (config) */ + unsigned int lflow; /* Flow length (config) */ + unsigned int nflows; /* accumulated flows (stats) */ + unsigned int curfl; /* current sequenced flow (state)*/ + + u16 queue_map_min; + u16 queue_map_max; + __u32 skb_priority; /* skb priority field */ + unsigned int burst; /* number of duplicated packets to burst */ + int node; /* Memory node */ + +#ifdef CONFIG_XFRM + __u8 ipsmode; /* IPSEC mode (config) */ + __u8 ipsproto; /* IPSEC type (config) */ + __u32 spi; + struct xfrm_dst xdst; + struct dst_ops dstops; +#endif + char result[512]; +}; + +struct pktgen_hdr { + __be32 pgh_magic; + __be32 seq_num; + __be32 tv_sec; + __be32 tv_usec; +}; + + +static unsigned int pg_net_id __read_mostly; + +struct pktgen_net { + struct net *net; + struct proc_dir_entry *proc_dir; + struct list_head pktgen_threads; + bool pktgen_exiting; +}; + +struct pktgen_thread { + struct mutex if_lock; /* for list of devices */ + struct list_head if_list; /* All device here */ + struct list_head th_list; + struct task_struct *tsk; + char result[512]; + + /* Field for thread to receive "posted" events terminate, + stop ifs etc. */ + + u32 control; + int cpu; + + wait_queue_head_t queue; + struct completion start_done; + struct pktgen_net *net; +}; + +#define REMOVE 1 +#define FIND 0 + +static const char version[] = + "Packet Generator for packet performance testing. " + "Version: " VERSION "\n"; + +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, + const char *ifname, bool exact); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(struct pktgen_net *pn); +static void pktgen_reset_all_threads(struct pktgen_net *pn); +static void pktgen_stop_all_threads(struct pktgen_net *pn); + +static void pktgen_stop(struct pktgen_thread *t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static void fill_imix_distribution(struct pktgen_dev *pkt_dev); + +/* Module parameters, defaults. */ +static int pg_count_d __read_mostly = 1000; +static int pg_delay_d __read_mostly; +static int pg_clone_skb_d __read_mostly; +static int debug __read_mostly; + +static DEFINE_MUTEX(pktgen_thread_lock); + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, +}; + +/* + * /proc handling functions + * + */ + +static int pgctrl_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, version); + return 0; +} + +static ssize_t pgctrl_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char data[128]; + struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (count == 0) + return -EINVAL; + + if (count > sizeof(data)) + count = sizeof(data); + + if (copy_from_user(data, buf, count)) + return -EFAULT; + + data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ + + if (!strcmp(data, "stop")) + pktgen_stop_all_threads(pn); + else if (!strcmp(data, "start")) + pktgen_run_all_threads(pn); + else if (!strcmp(data, "reset")) + pktgen_reset_all_threads(pn); + else + return -EINVAL; + + return count; +} + +static int pgctrl_open(struct inode *inode, struct file *file) +{ + return single_open(file, pgctrl_show, pde_data(inode)); +} + +static const struct proc_ops pktgen_proc_ops = { + .proc_open = pgctrl_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pgctrl_write, + .proc_release = single_release, +}; + +static int pktgen_if_show(struct seq_file *seq, void *v) +{ + const struct pktgen_dev *pkt_dev = seq->private; + ktime_t stopped; + unsigned int i; + u64 idle; + + seq_printf(seq, + "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", + (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, + pkt_dev->max_pkt_size); + + if (pkt_dev->n_imix_entries > 0) { + seq_puts(seq, " imix_weights: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].weight); + } + seq_puts(seq, "\n"); + } + + seq_printf(seq, + " frags: %d delay: %llu clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, + pkt_dev->clone_skb, pkt_dev->odevname); + + seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, + pkt_dev->lflow); + + seq_printf(seq, + " queue_map_min: %u queue_map_max: %u\n", + pkt_dev->queue_map_min, + pkt_dev->queue_map_max); + + if (pkt_dev->skb_priority) + seq_printf(seq, " skb_priority: %u\n", + pkt_dev->skb_priority); + + if (pkt_dev->flags & F_IPV6) { + seq_printf(seq, + " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n" + " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n", + &pkt_dev->in6_saddr, + &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr, + &pkt_dev->in6_daddr, + &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr); + } else { + seq_printf(seq, + " dst_min: %s dst_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max); + seq_printf(seq, + " src_min: %s src_max: %s\n", + pkt_dev->src_min, pkt_dev->src_max); + } + + seq_puts(seq, " src_mac: "); + + seq_printf(seq, "%pM ", + is_zero_ether_addr(pkt_dev->src_mac) ? + pkt_dev->odev->dev_addr : pkt_dev->src_mac); + + seq_puts(seq, "dst_mac: "); + seq_printf(seq, "%pM\n", pkt_dev->dst_mac); + + seq_printf(seq, + " udp_src_min: %d udp_src_max: %d" + " udp_dst_min: %d udp_dst_max: %d\n", + pkt_dev->udp_src_min, pkt_dev->udp_src_max, + pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); + + seq_printf(seq, + " src_mac_count: %d dst_mac_count: %d\n", + pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + + if (pkt_dev->nr_labels) { + seq_puts(seq, " mpls: "); + for (i = 0; i < pkt_dev->nr_labels; i++) + seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), + i == pkt_dev->nr_labels-1 ? "\n" : ", "); + } + + if (pkt_dev->vlan_id != 0xffff) + seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n", + pkt_dev->vlan_id, pkt_dev->vlan_p, + pkt_dev->vlan_cfi); + + if (pkt_dev->svlan_id != 0xffff) + seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n", + pkt_dev->svlan_id, pkt_dev->svlan_p, + pkt_dev->svlan_cfi); + + if (pkt_dev->tos) + seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos); + + if (pkt_dev->traffic_class) + seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + + if (pkt_dev->burst > 1) + seq_printf(seq, " burst: %d\n", pkt_dev->burst); + + if (pkt_dev->node >= 0) + seq_printf(seq, " node: %d\n", pkt_dev->node); + + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) + seq_puts(seq, " xmit_mode: netif_receive\n"); + else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) + seq_puts(seq, " xmit_mode: xmit_queue\n"); + + seq_puts(seq, " Flags: "); + + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (i == FLOW_SEQ_SHIFT) + if (!pkt_dev->cflows) + continue; + + if (pkt_dev->flags & (1 << i)) { + seq_printf(seq, "%s ", pkt_flag_names[i]); +#ifdef CONFIG_XFRM + if (i == IPSEC_SHIFT && pkt_dev->spi) + seq_printf(seq, "spi:%u ", pkt_dev->spi); +#endif + } else if (i == FLOW_SEQ_SHIFT) { + seq_puts(seq, "FLOW_RND "); + } + } + + seq_puts(seq, "\n"); + + /* not really stopped, more like last-running-at */ + stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at; + idle = pkt_dev->idle_acc; + do_div(idle, NSEC_PER_USEC); + + seq_printf(seq, + "Current:\n pkts-sofar: %llu errors: %llu\n", + (unsigned long long)pkt_dev->sofar, + (unsigned long long)pkt_dev->errors); + + if (pkt_dev->n_imix_entries > 0) { + int i; + + seq_puts(seq, " imix_size_counts: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].count_so_far); + } + seq_puts(seq, "\n"); + } + + seq_printf(seq, + " started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) ktime_to_us(pkt_dev->started_at), + (unsigned long long) ktime_to_us(stopped), + (unsigned long long) idle); + + seq_printf(seq, + " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", + pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, + pkt_dev->cur_src_mac_offset); + + if (pkt_dev->flags & F_IPV6) { + seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n", + &pkt_dev->cur_in6_saddr, + &pkt_dev->cur_in6_daddr); + } else + seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n", + &pkt_dev->cur_saddr, &pkt_dev->cur_daddr); + + seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", + pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + + seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map); + + seq_printf(seq, " flows: %u\n", pkt_dev->nflows); + + if (pkt_dev->result[0]) + seq_printf(seq, "Result: %s\n", pkt_dev->result); + else + seq_puts(seq, "Result: Idle\n"); + + return 0; +} + + +static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, + __u32 *num) +{ + int i = 0; + *num = 0; + + for (; i < maxlen; i++) { + int value; + char c; + *num <<= 4; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + value = hex_to_bin(c); + if (value >= 0) + *num |= value; + else + break; + } + return i; +} + +static int count_trail_chars(const char __user * user_buffer, + unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + case '=': + break; + default: + goto done; + } + } +done: + return i; +} + +static long num_arg(const char __user *user_buffer, unsigned long maxlen, + unsigned long *num) +{ + int i; + *num = 0; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + if ((c >= '0') && (c <= '9')) { + *num *= 10; + *num += c - '0'; + } else + break; + } + return i; +} + +static int strn_len(const char __user * user_buffer, unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + goto done_str; + default: + break; + } + } +done_str: + return i; +} + +/* Parses imix entries from user buffer. + * The user buffer should consist of imix entries separated by spaces + * where each entry consists of size and weight delimited by commas. + * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. + */ +static ssize_t get_imix_entries(const char __user *buffer, + struct pktgen_dev *pkt_dev) +{ + const int max_digits = 10; + int i = 0; + long len; + char c; + + pkt_dev->n_imix_entries = 0; + + do { + unsigned long weight; + unsigned long size; + + len = num_arg(&buffer[i], max_digits, &size); + if (len < 0) + return len; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + /* Check for comma between size_i and weight_i */ + if (c != ',') + return -EINVAL; + i++; + + if (size < 14 + 20 + 8) + size = 14 + 20 + 8; + + len = num_arg(&buffer[i], max_digits, &weight); + if (len < 0) + return len; + if (weight <= 0) + return -EINVAL; + + pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size; + pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; + + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + + i++; + pkt_dev->n_imix_entries++; + + if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) + return -E2BIG; + } while (c == ' '); + + return i; +} + +static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +{ + unsigned int n = 0; + char c; + ssize_t i = 0; + int len; + + pkt_dev->nr_labels = 0; + do { + __u32 tmp; + len = hex32_arg(&buffer[i], 8, &tmp); + if (len <= 0) + return len; + pkt_dev->labels[n] = htonl(tmp); + if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) + pkt_dev->flags |= F_MPLS_RND; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + i++; + n++; + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + } while (c == ','); + + pkt_dev->nr_labels = n; + return i; +} + +static __u32 pktgen_read_flag(const char *f, bool *disable) +{ + __u32 i; + + if (f[0] == '!') { + *disable = true; + f++; + } + + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT) + continue; + + /* allow only disabling ipv6 flag */ + if (!*disable && i == IPV6_SHIFT) + continue; + + if (strcmp(f, pkt_flag_names[i]) == 0) + return 1 << i; + } + + if (strcmp(f, "FLOW_RND") == 0) { + *disable = !*disable; + return F_FLOW_SEQ; + } + + return 0; +} + +static ssize_t pktgen_if_write(struct file *file, + const char __user * user_buffer, size_t count, + loff_t * offset) +{ + struct seq_file *seq = file->private_data; + struct pktgen_dev *pkt_dev = seq->private; + int i, max, len; + char name[16], valstr[32]; + unsigned long value = 0; + char *pg_result = NULL; + int tmp = 0; + char buf[128]; + + pg_result = &(pkt_dev->result[0]); + + if (count < 1) { + pr_warn("wrong command format\n"); + return -EINVAL; + } + + max = count; + tmp = count_trail_chars(user_buffer, max); + if (tmp < 0) { + pr_warn("illegal format\n"); + return tmp; + } + i = tmp; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) { + size_t copy = min_t(size_t, count + 1, 1024); + char *tp = strndup_user(user_buffer, copy); + + if (IS_ERR(tp)) + return PTR_ERR(tp); + + pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp); + kfree(tp); + } + + if (!strcmp(name, "min_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: min_pkt_size=%d", + pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "max_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->max_pkt_size) { + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: max_pkt_size=%d", + pkt_dev->max_pkt_size); + return count; + } + + /* Shortcut for min = max */ + + if (!strcmp(name, "pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "imix_weights")) { + if (pkt_dev->clone_skb > 0) + return -EINVAL; + + len = get_imix_entries(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + + fill_imix_distribution(pkt_dev); + + i += len; + return count; + } + + if (!strcmp(name, "debug")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + debug = value; + sprintf(pg_result, "OK: debug=%u", debug); + return count; + } + + if (!strcmp(name, "frags")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->nfrags = value; + sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); + return count; + } + if (!strcmp(name, "delay")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value == 0x7FFFFFFF) + pkt_dev->delay = ULLONG_MAX; + else + pkt_dev->delay = (u64)value; + + sprintf(pg_result, "OK: delay=%llu", + (unsigned long long) pkt_dev->delay); + return count; + } + if (!strcmp(name, "rate")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "ratep")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = NSEC_PER_SEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "udp_src_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_src_min) { + pkt_dev->udp_src_min = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min); + return count; + } + if (!strcmp(name, "udp_dst_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_dst_min) { + pkt_dev->udp_dst_min = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min); + return count; + } + if (!strcmp(name, "udp_src_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_src_max) { + pkt_dev->udp_src_max = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max); + return count; + } + if (!strcmp(name, "udp_dst_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_dst_max) { + pkt_dev->udp_dst_max = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max); + return count; + } + if (!strcmp(name, "clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + /* clone_skb is not supported for netif_receive xmit_mode and + * IMIX mode. + */ + if ((value > 0) && + ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || + !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + return -ENOTSUPP; + if (value > 0 && pkt_dev->n_imix_entries > 0) + return -EINVAL; + + i += len; + pkt_dev->clone_skb = value; + + sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); + return count; + } + if (!strcmp(name, "count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->count = value; + sprintf(pg_result, "OK: count=%llu", + (unsigned long long)pkt_dev->count); + return count; + } + if (!strcmp(name, "src_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (pkt_dev->src_mac_count != value) { + pkt_dev->src_mac_count = value; + pkt_dev->cur_src_mac_offset = 0; + } + sprintf(pg_result, "OK: src_mac_count=%d", + pkt_dev->src_mac_count); + return count; + } + if (!strcmp(name, "dst_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (pkt_dev->dst_mac_count != value) { + pkt_dev->dst_mac_count = value; + pkt_dev->cur_dst_mac_offset = 0; + } + sprintf(pg_result, "OK: dst_mac_count=%d", + pkt_dev->dst_mac_count); + return count; + } + if (!strcmp(name, "burst")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if ((value > 1) && + ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || + ((pkt_dev->xmit_mode == M_START_XMIT) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) + return -ENOTSUPP; + pkt_dev->burst = value < 1 ? 1 : value; + sprintf(pg_result, "OK: burst=%u", pkt_dev->burst); + return count; + } + if (!strcmp(name, "node")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + + if (node_possible(value)) { + pkt_dev->node = value; + sprintf(pg_result, "OK: node=%d", pkt_dev->node); + if (pkt_dev->page) { + put_page(pkt_dev->page); + pkt_dev->page = NULL; + } + } + else + sprintf(pg_result, "ERROR: node not possible"); + return count; + } + if (!strcmp(name, "xmit_mode")) { + char f[32]; + + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + + if (strcmp(f, "start_xmit") == 0) { + pkt_dev->xmit_mode = M_START_XMIT; + } else if (strcmp(f, "netif_receive") == 0) { + /* clone_skb set earlier, not supported in this mode */ + if (pkt_dev->clone_skb > 0) + return -ENOTSUPP; + + pkt_dev->xmit_mode = M_NETIF_RECEIVE; + + /* make sure new packet is allocated every time + * pktgen_xmit() is called + */ + pkt_dev->last_ok = 1; + } else if (strcmp(f, "queue_xmit") == 0) { + pkt_dev->xmit_mode = M_QUEUE_XMIT; + pkt_dev->last_ok = 1; + } else { + sprintf(pg_result, + "xmit_mode -:%s:- unknown\nAvailable modes: %s", + f, "start_xmit, netif_receive\n"); + return count; + } + sprintf(pg_result, "OK: xmit_mode=%s", f); + return count; + } + if (!strcmp(name, "flag")) { + __u32 flag; + char f[32]; + bool disable = false; + + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + + flag = pktgen_read_flag(f, &disable); + + if (flag) { + if (disable) + pkt_dev->flags &= ~flag; + else + pkt_dev->flags |= flag; + } else { + sprintf(pg_result, + "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", + f, + "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " + "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " + "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " + "NO_TIMESTAMP, " +#ifdef CONFIG_XFRM + "IPSEC, " +#endif + "NODE_ALLOC\n"); + return count; + } + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; + } + if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_min) != 0) { + memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); + strcpy(pkt_dev->dst_min, buf); + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->cur_daddr = pkt_dev->daddr_min; + } + if (debug) + pr_debug("dst_min set to: %s\n", pkt_dev->dst_min); + i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); + return count; + } + if (!strcmp(name, "dst_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_max) != 0) { + memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); + strcpy(pkt_dev->dst_max, buf); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + pkt_dev->cur_daddr = pkt_dev->daddr_max; + } + if (debug) + pr_debug("dst_max set to: %s\n", pkt_dev->dst_max); + i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); + return count; + } + if (!strcmp(name, "dst6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); + + pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr; + + if (debug) + pr_debug("dst6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6=%s", buf); + return count; + } + if (!strcmp(name, "dst6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); + + pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr; + if (debug) + pr_debug("dst6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_min=%s", buf); + return count; + } + if (!strcmp(name, "dst6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr); + + if (debug) + pr_debug("dst6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_max=%s", buf); + return count; + } + if (!strcmp(name, "src6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); + + pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr; + + if (debug) + pr_debug("src6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6=%s", buf); + return count; + } + if (!strcmp(name, "src_min")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_min) != 0) { + memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); + strcpy(pkt_dev->src_min, buf); + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->cur_saddr = pkt_dev->saddr_min; + } + if (debug) + pr_debug("src_min set to: %s\n", pkt_dev->src_min); + i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); + return count; + } + if (!strcmp(name, "src_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_max) != 0) { + memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); + strcpy(pkt_dev->src_max, buf); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + pkt_dev->cur_saddr = pkt_dev->saddr_max; + } + if (debug) + pr_debug("src_max set to: %s\n", pkt_dev->src_max); + i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); + return count; + } + if (!strcmp(name, "dst_mac")) { + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) + return len; + + memset(valstr, 0, sizeof(valstr)); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + + if (!mac_pton(valstr, pkt_dev->dst_mac)) + return -EINVAL; + /* Set up Dest MAC */ + ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac); + + sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); + return count; + } + if (!strcmp(name, "src_mac")) { + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) + return len; + + memset(valstr, 0, sizeof(valstr)); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + + if (!mac_pton(valstr, pkt_dev->src_mac)) + return -EINVAL; + /* Set up Src MAC */ + ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac); + + sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); + return count; + } + + if (!strcmp(name, "clear_counters")) { + pktgen_clear_counters(pkt_dev); + sprintf(pg_result, "OK: Clearing counters.\n"); + return count; + } + + if (!strcmp(name, "flows")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value > MAX_CFLOWS) + value = MAX_CFLOWS; + + pkt_dev->cflows = value; + sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); + return count; + } +#ifdef CONFIG_XFRM + if (!strcmp(name, "spi")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->spi = value; + sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); + return count; + } +#endif + if (!strcmp(name, "flowlen")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->lflow = value; + sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); + return count; + } + + if (!strcmp(name, "queue_map_min")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->queue_map_min = value; + sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); + return count; + } + + if (!strcmp(name, "queue_map_max")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->queue_map_max = value; + sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); + return count; + } + + if (!strcmp(name, "mpls")) { + unsigned int n, cnt; + + len = get_labels(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + i += len; + cnt = sprintf(pg_result, "OK: mpls="); + for (n = 0; n < pkt_dev->nr_labels; n++) + cnt += sprintf(pg_result + cnt, + "%08x%s", ntohl(pkt_dev->labels[n]), + n == pkt_dev->nr_labels-1 ? "" : ","); + + if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + pr_debug("VLAN/SVLAN auto turned off\n"); + } + return count; + } + + if (!strcmp(name, "vlan_id")) { + len = num_arg(&user_buffer[i], 4, &value); + if (len < 0) + return len; + + i += len; + if (value <= 4095) { + pkt_dev->vlan_id = value; /* turn on VLAN */ + + if (debug) + pr_debug("VLAN turned on\n"); + + if (debug && pkt_dev->nr_labels) + pr_debug("MPLS auto turned off\n"); + + pkt_dev->nr_labels = 0; /* turn off MPLS */ + sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id); + } else { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + pr_debug("VLAN/SVLAN turned off\n"); + } + return count; + } + + if (!strcmp(name, "vlan_p")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_p = value; + sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); + } else { + sprintf(pg_result, "ERROR: vlan_p must be 0-7"); + } + return count; + } + + if (!strcmp(name, "vlan_cfi")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_cfi = value; + sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); + } else { + sprintf(pg_result, "ERROR: vlan_cfi must be 0-1"); + } + return count; + } + + if (!strcmp(name, "svlan_id")) { + len = num_arg(&user_buffer[i], 4, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { + pkt_dev->svlan_id = value; /* turn on SVLAN */ + + if (debug) + pr_debug("SVLAN turned on\n"); + + if (debug && pkt_dev->nr_labels) + pr_debug("MPLS auto turned off\n"); + + pkt_dev->nr_labels = 0; /* turn off MPLS */ + sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id); + } else { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + pr_debug("VLAN/SVLAN turned off\n"); + } + return count; + } + + if (!strcmp(name, "svlan_p")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_p = value; + sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); + } else { + sprintf(pg_result, "ERROR: svlan_p must be 0-7"); + } + return count; + } + + if (!strcmp(name, "svlan_cfi")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_cfi = value; + sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); + } else { + sprintf(pg_result, "ERROR: svlan_cfi must be 0-1"); + } + return count; + } + + if (!strcmp(name, "tos")) { + __u32 tmp_value = 0; + len = hex32_arg(&user_buffer[i], 2, &tmp_value); + if (len < 0) + return len; + + i += len; + if (len == 2) { + pkt_dev->tos = tmp_value; + sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); + } else { + sprintf(pg_result, "ERROR: tos must be 00-ff"); + } + return count; + } + + if (!strcmp(name, "traffic_class")) { + __u32 tmp_value = 0; + len = hex32_arg(&user_buffer[i], 2, &tmp_value); + if (len < 0) + return len; + + i += len; + if (len == 2) { + pkt_dev->traffic_class = tmp_value; + sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); + } else { + sprintf(pg_result, "ERROR: traffic_class must be 00-ff"); + } + return count; + } + + if (!strcmp(name, "skb_priority")) { + len = num_arg(&user_buffer[i], 9, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->skb_priority = value; + sprintf(pg_result, "OK: skb_priority=%i", + pkt_dev->skb_priority); + return count; + } + + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); + return -EINVAL; +} + +static int pktgen_if_open(struct inode *inode, struct file *file) +{ + return single_open(file, pktgen_if_show, pde_data(inode)); +} + +static const struct proc_ops pktgen_if_proc_ops = { + .proc_open = pktgen_if_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_if_write, + .proc_release = single_release, +}; + +static int pktgen_thread_show(struct seq_file *seq, void *v) +{ + struct pktgen_thread *t = seq->private; + const struct pktgen_dev *pkt_dev; + + BUG_ON(!t); + + seq_puts(seq, "Running: "); + + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) + seq_printf(seq, "%s ", pkt_dev->odevname); + + seq_puts(seq, "\nStopped: "); + + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (!pkt_dev->running) + seq_printf(seq, "%s ", pkt_dev->odevname); + + if (t->result[0]) + seq_printf(seq, "\nResult: %s\n", t->result); + else + seq_puts(seq, "\nResult: NA\n"); + + rcu_read_unlock(); + + return 0; +} + +static ssize_t pktgen_thread_write(struct file *file, + const char __user * user_buffer, + size_t count, loff_t * offset) +{ + struct seq_file *seq = file->private_data; + struct pktgen_thread *t = seq->private; + int i, max, len, ret; + char name[40]; + char *pg_result; + + if (count < 1) { + // sprintf(pg_result, "Wrong command format"); + return -EINVAL; + } + + max = count; + len = count_trail_chars(user_buffer, max); + if (len < 0) + return len; + + i = len; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) + pr_debug("t=%s, count=%lu\n", name, (unsigned long)count); + + if (!t) { + pr_err("ERROR: No thread\n"); + ret = -EINVAL; + goto out; + } + + pg_result = &(t->result[0]); + + if (!strcmp(name, "add_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + mutex_lock(&pktgen_thread_lock); + ret = pktgen_add_device(t, f); + mutex_unlock(&pktgen_thread_lock); + if (!ret) { + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + } else + sprintf(pg_result, "ERROR: can not add device %s", f); + goto out; + } + + if (!strcmp(name, "rem_device_all")) { + mutex_lock(&pktgen_thread_lock); + t->control |= T_REMDEVALL; + mutex_unlock(&pktgen_thread_lock); + schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + ret = count; + sprintf(pg_result, "OK: rem_device_all"); + goto out; + } + + if (!strcmp(name, "max_before_softirq")) { + sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use"); + ret = count; + goto out; + } + + ret = -EINVAL; +out: + return ret; +} + +static int pktgen_thread_open(struct inode *inode, struct file *file) +{ + return single_open(file, pktgen_thread_show, pde_data(inode)); +} + +static const struct proc_ops pktgen_thread_proc_ops = { + .proc_open = pktgen_thread_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = pktgen_thread_write, + .proc_release = single_release, +}; + +/* Think find or remove for NN */ +static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn, + const char *ifname, int remove) +{ + struct pktgen_thread *t; + struct pktgen_dev *pkt_dev = NULL; + bool exact = (remove == FIND); + + list_for_each_entry(t, &pn->pktgen_threads, th_list) { + pkt_dev = pktgen_find_dev(t, ifname, exact); + if (pkt_dev) { + if (remove) { + pkt_dev->removal_mark = 1; + t->control |= T_REMDEV; + } + break; + } + } + return pkt_dev; +} + +/* + * mark a device for removal + */ +static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) +{ + struct pktgen_dev *pkt_dev = NULL; + const int max_tries = 10, msec_per_try = 125; + int i = 0; + + mutex_lock(&pktgen_thread_lock); + pr_debug("%s: marking %s for removal\n", __func__, ifname); + + while (1) { + + pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE); + if (pkt_dev == NULL) + break; /* success */ + + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); + schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { + pr_err("%s: timed out after waiting %d msec for device %s to be removed\n", + __func__, msec_per_try * i, ifname); + break; + } + + } + + mutex_unlock(&pktgen_thread_lock); +} + +static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev) +{ + struct pktgen_thread *t; + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pn->pktgen_threads, th_list) { + struct pktgen_dev *pkt_dev; + + if_lock(t); + list_for_each_entry(pkt_dev, &t->if_list, list) { + if (pkt_dev->odev != dev) + continue; + + proc_remove(pkt_dev->entry); + + pkt_dev->entry = proc_create_data(dev->name, 0600, + pn->proc_dir, + &pktgen_if_proc_ops, + pkt_dev); + if (!pkt_dev->entry) + pr_err("can't move proc entry for '%s'\n", + dev->name); + break; + } + if_unlock(t); + } + mutex_unlock(&pktgen_thread_lock); +} + +static int pktgen_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); + + if (pn->pktgen_exiting) + return NOTIFY_DONE; + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGENAME: + pktgen_change_name(pn, dev); + break; + + case NETDEV_UNREGISTER: + pktgen_mark_device(pn, dev->name); + break; + } + + return NOTIFY_DONE; +} + +static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn, + struct pktgen_dev *pkt_dev, + const char *ifname) +{ + char b[IFNAMSIZ+5]; + int i; + + for (i = 0; ifname[i] != '@'; i++) { + if (i == IFNAMSIZ) + break; + + b[i] = ifname[i]; + } + b[i] = 0; + + return dev_get_by_name(pn->net, b); +} + + +/* Associate pktgen_dev with a device. */ + +static int pktgen_setup_dev(const struct pktgen_net *pn, + struct pktgen_dev *pkt_dev, const char *ifname) +{ + struct net_device *odev; + int err; + + /* Clean old setups */ + if (pkt_dev->odev) { + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); + pkt_dev->odev = NULL; + } + + odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname); + if (!odev) { + pr_err("no such netdevice: \"%s\"\n", ifname); + return -ENODEV; + } + + if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) { + pr_err("not an ethernet or loopback device: \"%s\"\n", ifname); + err = -EINVAL; + } else if (!netif_running(odev)) { + pr_err("device is down: \"%s\"\n", ifname); + err = -ENETDOWN; + } else { + pkt_dev->odev = odev; + netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL); + return 0; + } + + dev_put(odev); + return err; +} + +/* Read pkt_dev from the interface and set up internal pktgen_dev + * structure to have the right information to create/send packets + */ +static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) +{ + int ntxq; + + if (!pkt_dev->odev) { + pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n"); + sprintf(pkt_dev->result, + "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + return; + } + + /* make sure that we don't pick a non-existing transmit queue */ + ntxq = pkt_dev->odev->real_num_tx_queues; + + if (ntxq <= pkt_dev->queue_map_min) { + pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); + pkt_dev->queue_map_min = (ntxq ?: 1) - 1; + } + if (pkt_dev->queue_map_max >= ntxq) { + pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); + pkt_dev->queue_map_max = (ntxq ?: 1) - 1; + } + + /* Default to the interface's mac if not explicitly set. */ + + if (is_zero_ether_addr(pkt_dev->src_mac)) + ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr); + + /* Set up Dest MAC */ + ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac); + + if (pkt_dev->flags & F_IPV6) { + int i, set = 0, err = 1; + struct inet6_dev *idev; + + if (pkt_dev->min_pkt_size == 0) { + pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + + sizeof(struct pktgen_hdr) + + pkt_dev->pkt_overhead; + } + + for (i = 0; i < sizeof(struct in6_addr); i++) + if (pkt_dev->cur_in6_saddr.s6_addr[i]) { + set = 1; + break; + } + + if (!set) { + + /* + * Use linklevel address if unconfigured. + * + * use ipv6_get_lladdr if/when it's get exported + */ + + rcu_read_lock(); + idev = __in6_dev_get(pkt_dev->odev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if ((ifp->scope & IFA_LINK) && + !(ifp->flags & IFA_F_TENTATIVE)) { + pkt_dev->cur_in6_saddr = ifp->addr; + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + if (err) + pr_err("ERROR: IPv6 link address not available\n"); + } + } else { + if (pkt_dev->min_pkt_size == 0) { + pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr) + + sizeof(struct udphdr) + + sizeof(struct pktgen_hdr) + + pkt_dev->pkt_overhead; + } + + pkt_dev->saddr_min = 0; + pkt_dev->saddr_max = 0; + if (strlen(pkt_dev->src_min) == 0) { + + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(pkt_dev->odev); + if (in_dev) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(in_dev->ifa_list); + if (ifa) { + pkt_dev->saddr_min = ifa->ifa_address; + pkt_dev->saddr_max = pkt_dev->saddr_min; + } + } + rcu_read_unlock(); + } else { + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + } + + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + } + /* Initialize current values. */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size) + pkt_dev->max_pkt_size = pkt_dev->min_pkt_size; + + pkt_dev->cur_dst_mac_offset = 0; + pkt_dev->cur_src_mac_offset = 0; + pkt_dev->cur_saddr = pkt_dev->saddr_min; + pkt_dev->cur_daddr = pkt_dev->daddr_min; + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + pkt_dev->nflows = 0; +} + + +static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) +{ + ktime_t start_time, end_time; + s64 remaining; + struct hrtimer_sleeper t; + + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, spin_until); + + remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); + if (remaining <= 0) + goto out; + + start_time = ktime_get(); + if (remaining < 100000) { + /* for small delays (<100us), just loop until limit is reached */ + do { + end_time = ktime_get(); + } while (ktime_compare(end_time, spin_until) < 0); + } else { + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS); + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + } while (t.task && pkt_dev->running && !signal_pending(current)); + __set_current_state(TASK_RUNNING); + end_time = ktime_get(); + } + + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); +out: + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + destroy_hrtimer_on_stack(&t.timer); +} + +static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) +{ + pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); + pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); + pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); +} + +static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow) +{ + return !!(pkt_dev->flows[flow].flags & F_INIT); +} + +static inline int f_pick(struct pktgen_dev *pkt_dev) +{ + int flow = pkt_dev->curfl; + + if (pkt_dev->flags & F_FLOW_SEQ) { + if (pkt_dev->flows[flow].count >= pkt_dev->lflow) { + /* reset time */ + pkt_dev->flows[flow].count = 0; + pkt_dev->flows[flow].flags = 0; + pkt_dev->curfl += 1; + if (pkt_dev->curfl >= pkt_dev->cflows) + pkt_dev->curfl = 0; /*reset */ + } + } else { + flow = prandom_u32_max(pkt_dev->cflows); + pkt_dev->curfl = flow; + + if (pkt_dev->flows[flow].count > pkt_dev->lflow) { + pkt_dev->flows[flow].count = 0; + pkt_dev->flows[flow].flags = 0; + } + } + + return pkt_dev->curfl; +} + + +#ifdef CONFIG_XFRM +/* If there was already an IPSEC SA, we keep it as is, else + * we go look for it ... +*/ +#define DUMMY_MARK 0 +static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) +{ + struct xfrm_state *x = pkt_dev->flows[flow].x; + struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { + + if (pkt_dev->spi) { + /* We need as quick as possible to find the right SA + * Searching with minimum criteria to archieve this. + */ + x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); + } else { + /* slow path: we dont already have xfrm_state */ + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, + (xfrm_address_t *)&pkt_dev->cur_daddr, + (xfrm_address_t *)&pkt_dev->cur_saddr, + AF_INET, + pkt_dev->ipsmode, + pkt_dev->ipsproto, 0); + } + if (x) { + pkt_dev->flows[flow].x = x; + set_pkt_overhead(pkt_dev); + pkt_dev->pkt_overhead += x->props.header_len; + } + + } +} +#endif +static void set_cur_queue_map(struct pktgen_dev *pkt_dev) +{ + + if (pkt_dev->flags & F_QUEUE_MAP_CPU) + pkt_dev->cur_queue_map = smp_processor_id(); + + else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { + __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { + t = prandom_u32_max(pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; + } else { + t = pkt_dev->cur_queue_map + 1; + if (t > pkt_dev->queue_map_max) + t = pkt_dev->queue_map_min; + } + pkt_dev->cur_queue_map = t; + } + pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues; +} + +/* Increment/randomize headers according to flags and current values + * for IP src/dest, UDP src/dst port, MAC-Addr src/dst + */ +static void mod_cur_headers(struct pktgen_dev *pkt_dev) +{ + __u32 imn; + __u32 imx; + int flow = 0; + + if (pkt_dev->cflows) + flow = f_pick(pkt_dev); + + /* Deal with source MAC */ + if (pkt_dev->src_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACSRC_RND) + mc = prandom_u32_max(pkt_dev->src_mac_count); + else { + mc = pkt_dev->cur_src_mac_offset++; + if (pkt_dev->cur_src_mac_offset >= + pkt_dev->src_mac_count) + pkt_dev->cur_src_mac_offset = 0; + } + + tmp = pkt_dev->src_mac[5] + (mc & 0xFF); + pkt_dev->hh[11] = tmp; + tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[10] = tmp; + tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[9] = tmp; + tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[8] = tmp; + tmp = (pkt_dev->src_mac[1] + (tmp >> 8)); + pkt_dev->hh[7] = tmp; + } + + /* Deal with Destination MAC */ + if (pkt_dev->dst_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACDST_RND) + mc = prandom_u32_max(pkt_dev->dst_mac_count); + + else { + mc = pkt_dev->cur_dst_mac_offset++; + if (pkt_dev->cur_dst_mac_offset >= + pkt_dev->dst_mac_count) { + pkt_dev->cur_dst_mac_offset = 0; + } + } + + tmp = pkt_dev->dst_mac[5] + (mc & 0xFF); + pkt_dev->hh[5] = tmp; + tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[4] = tmp; + tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[3] = tmp; + tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[2] = tmp; + tmp = (pkt_dev->dst_mac[1] + (tmp >> 8)); + pkt_dev->hh[1] = tmp; + } + + if (pkt_dev->flags & F_MPLS_RND) { + unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) + if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) + pkt_dev->labels[i] = MPLS_STACK_BOTTOM | + ((__force __be32)get_random_u32() & + htonl(0x000fffff)); + } + + if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_id = prandom_u32_max(4096); + } + + if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_id = prandom_u32_max(4096); + } + + if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { + if (pkt_dev->flags & F_UDPSRC_RND) + pkt_dev->cur_udp_src = prandom_u32_max( + pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; + + else { + pkt_dev->cur_udp_src++; + if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max) + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + } + } + + if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { + if (pkt_dev->flags & F_UDPDST_RND) { + pkt_dev->cur_udp_dst = prandom_u32_max( + pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; + } else { + pkt_dev->cur_udp_dst++; + if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + } + } + + if (!(pkt_dev->flags & F_IPV6)) { + + imn = ntohl(pkt_dev->saddr_min); + imx = ntohl(pkt_dev->saddr_max); + if (imn < imx) { + __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) + t = prandom_u32_max(imx - imn) + imn; + else { + t = ntohl(pkt_dev->cur_saddr); + t++; + if (t > imx) + t = imn; + + } + pkt_dev->cur_saddr = htonl(t); + } + + if (pkt_dev->cflows && f_seen(pkt_dev, flow)) { + pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr; + } else { + imn = ntohl(pkt_dev->daddr_min); + imx = ntohl(pkt_dev->daddr_max); + if (imn < imx) { + __u32 t; + __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { + + do { + t = prandom_u32_max(imx - imn) + + imn; + s = htonl(t); + } while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)); + pkt_dev->cur_daddr = s; + } else { + t = ntohl(pkt_dev->cur_daddr); + t++; + if (t > imx) { + t = imn; + } + pkt_dev->cur_daddr = htonl(t); + } + } + if (pkt_dev->cflows) { + pkt_dev->flows[flow].flags |= F_INIT; + pkt_dev->flows[flow].cur_daddr = + pkt_dev->cur_daddr; +#ifdef CONFIG_XFRM + if (pkt_dev->flags & F_IPSEC) + get_ipsec_sa(pkt_dev, flow); +#endif + pkt_dev->nflows++; + } + } + } else { /* IPV6 * */ + + if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) { + int i; + + /* Only random destinations yet */ + + for (i = 0; i < 4; i++) { + pkt_dev->cur_in6_daddr.s6_addr32[i] = + (((__force __be32)get_random_u32() | + pkt_dev->min_in6_daddr.s6_addr32[i]) & + pkt_dev->max_in6_daddr.s6_addr32[i]); + } + } + } + + if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { + __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { + t = prandom_u32_max(pkt_dev->max_pkt_size - + pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; + } else { + t = pkt_dev->cur_pkt_size + 1; + if (t > pkt_dev->max_pkt_size) + t = pkt_dev->min_pkt_size; + } + pkt_dev->cur_pkt_size = t; + } else if (pkt_dev->n_imix_entries > 0) { + struct imix_pkt *entry; + __u32 t = prandom_u32_max(IMIX_PRECISION); + __u8 entry_index = pkt_dev->imix_distribution[t]; + + entry = &pkt_dev->imix_entries[entry_index]; + entry->count_so_far++; + pkt_dev->cur_pkt_size = entry->size; + } + + set_cur_queue_map(pkt_dev); + + pkt_dev->flows[flow].count++; +} + +static void fill_imix_distribution(struct pktgen_dev *pkt_dev) +{ + int cumulative_probabilites[MAX_IMIX_ENTRIES]; + int j = 0; + __u64 cumulative_prob = 0; + __u64 total_weight = 0; + int i = 0; + + for (i = 0; i < pkt_dev->n_imix_entries; i++) + total_weight += pkt_dev->imix_entries[i].weight; + + /* Fill cumulative_probabilites with sum of normalized probabilities */ + for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) { + cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight * + IMIX_PRECISION, + total_weight); + cumulative_probabilites[i] = cumulative_prob; + } + cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100; + + for (i = 0; i < IMIX_PRECISION; i++) { + if (i == cumulative_probabilites[j]) + j++; + pkt_dev->imix_distribution[i] = j; + } +} + +#ifdef CONFIG_XFRM +static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { + + [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */ +}; + +static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) +{ + struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; + int err = 0; + struct net *net = dev_net(pkt_dev->odev); + + if (!x) + return 0; + /* XXX: we dont support tunnel mode for now until + * we resolve the dst issue */ + if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) + return 0; + + /* But when user specify an valid SPI, transformation + * supports both transport/tunnel mode + ESP/AH type. + */ + if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; + + rcu_read_lock_bh(); + err = pktgen_xfrm_outer_mode_output(x, skb); + rcu_read_unlock_bh(); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); + goto error; + } + err = x->type->output(x, skb); + if (err) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR); + goto error; + } + spin_lock_bh(&x->lock); + x->curlft.bytes += skb->len; + x->curlft.packets++; + spin_unlock_bh(&x->lock); +error: + return err; +} + +static void free_SAs(struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->cflows) { + /* let go of the SAs if we have them */ + int i; + for (i = 0; i < pkt_dev->cflows; i++) { + struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { + xfrm_state_put(x); + pkt_dev->flows[i].x = NULL; + } + } + } +} + +static int process_ipsec(struct pktgen_dev *pkt_dev, + struct sk_buff *skb, __be16 protocol) +{ + if (pkt_dev->flags & F_IPSEC) { + struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; + int nhead = 0; + if (x) { + struct ethhdr *eth; + struct iphdr *iph; + int ret; + + nhead = x->props.header_len - skb_headroom(skb); + if (nhead > 0) { + ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); + if (ret < 0) { + pr_err("Error expanding ipsec packet %d\n", + ret); + goto err; + } + } + + /* ipsec is not expecting ll header */ + skb_pull(skb, ETH_HLEN); + ret = pktgen_output_ipsec(skb, pkt_dev); + if (ret) { + pr_err("Error creating ipsec packet %d\n", ret); + goto err; + } + /* restore ll */ + eth = skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); + eth->h_proto = protocol; + + /* Update IPv4 header len as well as checksum value */ + iph = ip_hdr(skb); + iph->tot_len = htons(skb->len - ETH_HLEN); + ip_send_check(iph); + } + } + return 1; +err: + kfree_skb(skb); + return 0; +} +#endif + +static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) +{ + unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) + *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; + + mpls--; + *mpls |= MPLS_STACK_BOTTOM; +} + +static inline __be16 build_tci(unsigned int id, unsigned int cfi, + unsigned int prio) +{ + return htons(id | (cfi << 12) | (prio << 13)); +} + +static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, + int datalen) +{ + struct timespec64 timestamp; + struct pktgen_hdr *pgh; + + pgh = skb_put(skb, sizeof(*pgh)); + datalen -= sizeof(*pgh); + + if (pkt_dev->nfrags <= 0) { + skb_put_zero(skb, datalen); + } else { + int frags = pkt_dev->nfrags; + int i, len; + int frag_len; + + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + len = datalen - frags * PAGE_SIZE; + if (len > 0) { + skb_put_zero(skb, len); + datalen = frags * PAGE_SIZE; + } + + i = 0; + frag_len = (datalen/frags) < PAGE_SIZE ? + (datalen/frags) : PAGE_SIZE; + while (datalen > 0) { + if (unlikely(!pkt_dev->page)) { + int node = numa_node_id(); + + if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE)) + node = pkt_dev->node; + pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!pkt_dev->page) + break; + } + get_page(pkt_dev->page); + skb_frag_set_page(skb, i, pkt_dev->page); + skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); + /*last fragment, fill rest of data*/ + if (i == (frags - 1)) + skb_frag_size_set(&skb_shinfo(skb)->frags[i], + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); + else + skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, + * convert them to network byte order + */ + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + if (pkt_dev->flags & F_NO_TIMESTAMP) { + pgh->tv_sec = 0; + pgh->tv_usec = 0; + } else { + /* + * pgh->tv_sec wraps in y2106 when interpreted as unsigned + * as done by wireshark, or y2038 when interpreted as signed. + * This is probably harmless, but if anyone wants to improve + * it, we could introduce a variant that puts 64-bit nanoseconds + * into the respective header bytes. + * This would also be slightly faster to read. + */ + ktime_get_real_ts64(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_nsec / NSEC_PER_USEC); + } +} + +static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, + struct pktgen_dev *pkt_dev) +{ + unsigned int extralen = LL_RESERVED_SPACE(dev); + struct sk_buff *skb = NULL; + unsigned int size; + + size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead; + if (pkt_dev->flags & F_NODE) { + int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } + } else { + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + } + + /* the caller pre-fetches from skb->data and reserves for the mac hdr */ + if (likely(skb)) + skb_reserve(skb, extralen - 16); + + return skb; +} + +static struct sk_buff *fill_packet_ipv4(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen, iplen; + struct iphdr *iph; + __be16 protocol = htons(ETH_P_IP); + __be32 *mpls; + __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ + __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ + __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ + __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; + + if (pkt_dev->nr_labels) + protocol = htons(ETH_P_MPLS_UC); + + if (pkt_dev->vlan_id != 0xffff) + protocol = htons(ETH_P_8021Q); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + + skb = pktgen_alloc_skb(odev, pkt_dev); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + + prefetchw(skb->data); + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = skb_push(skb, 14); + mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32)); + if (pkt_dev->nr_labels) + mpls_push(mpls, pkt_dev); + + if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { + svlan_tci = skb_put(skb, sizeof(__be16)); + *svlan_tci = build_tci(pkt_dev->svlan_id, + pkt_dev->svlan_cfi, + pkt_dev->svlan_p); + svlan_encapsulated_proto = skb_put(skb, + sizeof(__be16)); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); + } + vlan_tci = skb_put(skb, sizeof(__be16)); + *vlan_tci = build_tci(pkt_dev->vlan_id, + pkt_dev->vlan_cfi, + pkt_dev->vlan_p); + vlan_encapsulated_proto = skb_put(skb, sizeof(__be16)); + *vlan_encapsulated_proto = htons(ETH_P_IP); + } + + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->len); + iph = skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); + udph = skb_put(skb, sizeof(struct udphdr)); + skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + + memcpy(eth, pkt_dev->hh, 12); + *(__be16 *) & eth[12] = protocol; + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - + pkt_dev->pkt_overhead; + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) + datalen = sizeof(struct pktgen_hdr); + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + 8); /* DATA + udphdr */ + udph->check = 0; + + iph->ihl = 5; + iph->version = 4; + iph->ttl = 32; + iph->tos = pkt_dev->tos; + iph->protocol = IPPROTO_UDP; /* UDP */ + iph->saddr = pkt_dev->cur_saddr; + iph->daddr = pkt_dev->cur_daddr; + iph->id = htons(pkt_dev->ip_id); + pkt_dev->ip_id++; + iph->frag_off = 0; + iplen = 20 + 8 + datalen; + iph->tot_len = htons(iplen); + ip_send_check(iph); + skb->protocol = protocol; + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + pktgen_finalize_skb(pkt_dev, skb, datalen); + + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + udp4_hwcsum(skb, iph->saddr, iph->daddr); + } else { + __wsum csum = skb_checksum(skb, skb_transport_offset(skb), datalen + 8, 0); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen + 8, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + +#ifdef CONFIG_XFRM + if (!process_ipsec(pkt_dev, skb, protocol)) + return NULL; +#endif + + return skb; +} + +static struct sk_buff *fill_packet_ipv6(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen, udplen; + struct ipv6hdr *iph; + __be16 protocol = htons(ETH_P_IPV6); + __be32 *mpls; + __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ + __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ + __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ + __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; + + if (pkt_dev->nr_labels) + protocol = htons(ETH_P_MPLS_UC); + + if (pkt_dev->vlan_id != 0xffff) + protocol = htons(ETH_P_8021Q); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + + skb = pktgen_alloc_skb(odev, pkt_dev); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + + prefetchw(skb->data); + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = skb_push(skb, 14); + mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32)); + if (pkt_dev->nr_labels) + mpls_push(mpls, pkt_dev); + + if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { + svlan_tci = skb_put(skb, sizeof(__be16)); + *svlan_tci = build_tci(pkt_dev->svlan_id, + pkt_dev->svlan_cfi, + pkt_dev->svlan_p); + svlan_encapsulated_proto = skb_put(skb, + sizeof(__be16)); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); + } + vlan_tci = skb_put(skb, sizeof(__be16)); + *vlan_tci = build_tci(pkt_dev->vlan_id, + pkt_dev->vlan_cfi, + pkt_dev->vlan_p); + vlan_encapsulated_proto = skb_put(skb, sizeof(__be16)); + *vlan_encapsulated_proto = htons(ETH_P_IPV6); + } + + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->len); + iph = skb_put(skb, sizeof(struct ipv6hdr)); + + skb_set_transport_header(skb, skb->len); + udph = skb_put(skb, sizeof(struct udphdr)); + skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + + memcpy(eth, pkt_dev->hh, 12); + *(__be16 *) ð[12] = protocol; + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - + sizeof(struct ipv6hdr) - sizeof(struct udphdr) - + pkt_dev->pkt_overhead; + + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) { + datalen = sizeof(struct pktgen_hdr); + net_info_ratelimited("increased datalen to %d\n", datalen); + } + + udplen = datalen + sizeof(struct udphdr); + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(udplen); + udph->check = 0; + + *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ + + if (pkt_dev->traffic_class) { + /* Version + traffic class + flow (0) */ + *(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20)); + } + + iph->hop_limit = 32; + + iph->payload_len = htons(udplen); + iph->nexthdr = IPPROTO_UDP; + + iph->daddr = pkt_dev->cur_in6_daddr; + iph->saddr = pkt_dev->cur_in6_saddr; + + skb->protocol = protocol; + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + pktgen_finalize_skb(pkt_dev, skb, datalen); + + if (!(pkt_dev->flags & F_UDPCSUM)) { + skb->ip_summed = CHECKSUM_NONE; + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0); + } else { + __wsum csum = skb_checksum(skb, skb_transport_offset(skb), udplen, 0); + + /* add protocol-dependent pseudo-header */ + udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum); + + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + } + + return skb; +} + +static struct sk_buff *fill_packet(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->flags & F_IPV6) + return fill_packet_ipv6(odev, pkt_dev); + else + return fill_packet_ipv4(odev, pkt_dev); +} + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) +{ + pkt_dev->seq_num = 1; + pkt_dev->idle_acc = 0; + pkt_dev->sofar = 0; + pkt_dev->tx_bytes = 0; + pkt_dev->errors = 0; +} + +/* Set up structure for sending pkts, clear counters */ + +static void pktgen_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev; + int started = 0; + + func_enter(); + + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { + + /* + * setup odev and create initial packet. + */ + pktgen_setup_inject(pkt_dev); + + if (pkt_dev->odev) { + pktgen_clear_counters(pkt_dev); + pkt_dev->skb = NULL; + pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); + + set_pkt_overhead(pkt_dev); + + strcpy(pkt_dev->result, "Starting"); + pkt_dev->running = 1; /* Cranke yeself! */ + started++; + } else + strcpy(pkt_dev->result, "Error starting"); + } + rcu_read_unlock(); + if (started) + t->control &= ~(T_STOP); +} + +static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags) +{ + struct pktgen_thread *t; + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pn->pktgen_threads, th_list) + t->control |= (flags); + + mutex_unlock(&pktgen_thread_lock); +} + +static void pktgen_stop_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_STOP); +} + +static int thread_is_running(const struct pktgen_thread *t) +{ + const struct pktgen_dev *pkt_dev; + + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) + if (pkt_dev->running) { + rcu_read_unlock(); + return 1; + } + rcu_read_unlock(); + return 0; +} + +static int pktgen_wait_thread_run(struct pktgen_thread *t) +{ + while (thread_is_running(t)) { + + /* note: 't' will still be around even after the unlock/lock + * cycle because pktgen_thread threads are only cleared at + * net exit + */ + mutex_unlock(&pktgen_thread_lock); + msleep_interruptible(100); + mutex_lock(&pktgen_thread_lock); + + if (signal_pending(current)) + goto signal; + } + return 1; +signal: + return 0; +} + +static int pktgen_wait_all_threads_run(struct pktgen_net *pn) +{ + struct pktgen_thread *t; + int sig = 1; + + /* prevent from racing with rmmod */ + if (!try_module_get(THIS_MODULE)) + return sig; + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pn->pktgen_threads, th_list) { + sig = pktgen_wait_thread_run(t); + if (sig == 0) + break; + } + + if (sig == 0) + list_for_each_entry(t, &pn->pktgen_threads, th_list) + t->control |= (T_STOP); + + mutex_unlock(&pktgen_thread_lock); + module_put(THIS_MODULE); + return sig; +} + +static void pktgen_run_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_RUN); + + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); + + pktgen_wait_all_threads_run(pn); +} + +static void pktgen_reset_all_threads(struct pktgen_net *pn) +{ + func_enter(); + + pktgen_handle_all_threads(pn, T_REMDEVALL); + + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); + + pktgen_wait_all_threads_run(pn); +} + +static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) +{ + __u64 bps, mbps, pps; + char *p = pkt_dev->result; + ktime_t elapsed = ktime_sub(pkt_dev->stopped_at, + pkt_dev->started_at); + ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); + + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", + (unsigned long long)ktime_to_us(elapsed), + (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), + (unsigned long long)ktime_to_us(idle), + (unsigned long long)pkt_dev->sofar, + pkt_dev->cur_pkt_size, nr_frags); + + pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, + ktime_to_ns(elapsed)); + + if (pkt_dev->n_imix_entries > 0) { + int i; + struct imix_pkt *entry; + + bps = 0; + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + entry = &pkt_dev->imix_entries[i]; + bps += entry->size * entry->count_so_far; + } + bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed)); + } else { + bps = pps * 8 * pkt_dev->cur_pkt_size; + } + + mbps = bps; + do_div(mbps, 1000000); + p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu", + (unsigned long long)pps, + (unsigned long long)mbps, + (unsigned long long)bps, + (unsigned long long)pkt_dev->errors); +} + +/* Set stopped-at timer, remove from running list, do counters & statistics */ +static int pktgen_stop_device(struct pktgen_dev *pkt_dev) +{ + int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; + + if (!pkt_dev->running) { + pr_warn("interface: %s is already stopped\n", + pkt_dev->odevname); + return -EINVAL; + } + + pkt_dev->running = 0; + kfree_skb(pkt_dev->skb); + pkt_dev->skb = NULL; + pkt_dev->stopped_at = ktime_get(); + + show_results(pkt_dev, nr_frags); + + return 0; +} + +static struct pktgen_dev *next_to_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev, *best = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { + if (!pkt_dev->running) + continue; + if (best == NULL) + best = pkt_dev; + else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) + best = pkt_dev; + } + rcu_read_unlock(); + + return best; +} + +static void pktgen_stop(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev; + + func_enter(); + + rcu_read_lock(); + + list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { + pktgen_stop_device(pkt_dev); + } + + rcu_read_unlock(); +} + +/* + * one of our devices needs to be removed - find it + * and remove it + */ +static void pktgen_rem_one_if(struct pktgen_thread *t) +{ + struct list_head *q, *n; + struct pktgen_dev *cur; + + func_enter(); + + list_for_each_safe(q, n, &t->if_list) { + cur = list_entry(q, struct pktgen_dev, list); + + if (!cur->removal_mark) + continue; + + kfree_skb(cur->skb); + cur->skb = NULL; + + pktgen_remove_device(t, cur); + + break; + } +} + +static void pktgen_rem_all_ifs(struct pktgen_thread *t) +{ + struct list_head *q, *n; + struct pktgen_dev *cur; + + func_enter(); + + /* Remove all devices, free mem */ + + list_for_each_safe(q, n, &t->if_list) { + cur = list_entry(q, struct pktgen_dev, list); + + kfree_skb(cur->skb); + cur->skb = NULL; + + pktgen_remove_device(t, cur); + } +} + +static void pktgen_rem_thread(struct pktgen_thread *t) +{ + /* Remove from the thread list */ + remove_proc_entry(t->tsk->comm, t->net->proc_dir); +} + +static void pktgen_resched(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_get(); + schedule(); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); +} + +static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_get(); + + while (refcount_read(&(pkt_dev->skb->users)) != 1) { + if (signal_pending(current)) + break; + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); +} + +static void pktgen_xmit(struct pktgen_dev *pkt_dev) +{ + unsigned int burst = READ_ONCE(pkt_dev->burst); + struct net_device *odev = pkt_dev->odev; + struct netdev_queue *txq; + struct sk_buff *skb; + int ret; + + /* If device is offline, then don't send */ + if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) { + pktgen_stop_device(pkt_dev); + return; + } + + /* This is max DELAY, this has special meaning of + * "never transmit" + */ + if (unlikely(pkt_dev->delay == ULLONG_MAX)) { + pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX); + return; + } + + /* If no skb or clone count exhausted then get new one */ + if (!pkt_dev->skb || (pkt_dev->last_ok && + ++pkt_dev->clone_count >= pkt_dev->clone_skb)) { + /* build a new pkt */ + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = fill_packet(odev, pkt_dev); + if (pkt_dev->skb == NULL) { + pr_err("ERROR: couldn't allocate skb in fill_packet\n"); + schedule(); + pkt_dev->clone_count--; /* back out increment, OOM */ + return; + } + pkt_dev->last_pkt_size = pkt_dev->skb->len; + pkt_dev->clone_count = 0; /* reset counter */ + } + + if (pkt_dev->delay && pkt_dev->last_ok) + spin(pkt_dev, pkt_dev->next_tx); + + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { + skb = pkt_dev->skb; + skb->protocol = eth_type_trans(skb, skb->dev); + refcount_add(burst, &skb->users); + local_bh_disable(); + do { + ret = netif_receive_skb(skb); + if (ret == NET_RX_DROP) + pkt_dev->errors++; + pkt_dev->sofar++; + pkt_dev->seq_num++; + if (refcount_read(&skb->users) != burst) { + /* skb was queued by rps/rfs or taps, + * so cannot reuse this skb + */ + WARN_ON(refcount_sub_and_test(burst - 1, &skb->users)); + /* get out of the loop and wait + * until skb is consumed + */ + break; + } + /* skb was 'freed' by stack, so clean few + * bits and reuse it + */ + skb_reset_redirect(skb); + } while (--burst > 0); + goto out; /* Skips xmit_mode M_START_XMIT */ + } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { + local_bh_disable(); + refcount_inc(&pkt_dev->skb->users); + + ret = dev_queue_xmit(pkt_dev->skb); + switch (ret) { + case NET_XMIT_SUCCESS: + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + /* These are all valid return codes for a qdisc but + * indicate packets are being dropped or will likely + * be dropped soon. + */ + case NETDEV_TX_BUSY: + /* qdisc may call dev_hard_start_xmit directly in cases + * where no queues exist e.g. loopback device, virtual + * devices, etc. In this case we need to handle + * NETDEV_TX_ codes. + */ + default: + pkt_dev->errors++; + net_info_ratelimited("%s xmit error: %d\n", + pkt_dev->odevname, ret); + break; + } + goto out; + } + + txq = skb_get_tx_queue(odev, pkt_dev->skb); + + local_bh_disable(); + + HARD_TX_LOCK(odev, txq, smp_processor_id()); + + if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { + pkt_dev->last_ok = 0; + goto unlock; + } + refcount_add(burst, &pkt_dev->skb->users); + +xmit_more: + ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); + + switch (ret) { + case NETDEV_TX_OK: + pkt_dev->last_ok = 1; + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq)) + goto xmit_more; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + /* skb has been consumed */ + pkt_dev->errors++; + break; + default: /* Drivers are not supposed to return other values! */ + net_info_ratelimited("%s xmit error: %d\n", + pkt_dev->odevname, ret); + pkt_dev->errors++; + fallthrough; + case NETDEV_TX_BUSY: + /* Retry it next time */ + refcount_dec(&(pkt_dev->skb->users)); + pkt_dev->last_ok = 0; + } + if (unlikely(burst)) + WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users)); +unlock: + HARD_TX_UNLOCK(odev, txq); + +out: + local_bh_enable(); + + /* If pkt_dev->count is zero, then run forever */ + if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { + pktgen_wait_for_skb(pkt_dev); + + /* Done with this */ + pktgen_stop_device(pkt_dev); + } +} + +/* + * Main loop of the thread goes here + */ + +static int pktgen_thread_worker(void *arg) +{ + struct pktgen_thread *t = arg; + struct pktgen_dev *pkt_dev = NULL; + int cpu = t->cpu; + + WARN_ON(smp_processor_id() != cpu); + + init_waitqueue_head(&t->queue); + complete(&t->start_done); + + pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); + + set_freezable(); + + while (!kthread_should_stop()) { + pkt_dev = next_to_run(t); + + if (unlikely(!pkt_dev && t->control == 0)) { + if (t->net->pktgen_exiting) + break; + wait_event_interruptible_timeout(t->queue, + t->control != 0, + HZ/10); + try_to_freeze(); + continue; + } + + if (likely(pkt_dev)) { + pktgen_xmit(pkt_dev); + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + + if (t->control & T_STOP) { + pktgen_stop(t); + t->control &= ~(T_STOP); + } + + if (t->control & T_RUN) { + pktgen_run(t); + t->control &= ~(T_RUN); + } + + if (t->control & T_REMDEVALL) { + pktgen_rem_all_ifs(t); + t->control &= ~(T_REMDEVALL); + } + + if (t->control & T_REMDEV) { + pktgen_rem_one_if(t); + t->control &= ~(T_REMDEV); + } + + try_to_freeze(); + } + + pr_debug("%s stopping all device\n", t->tsk->comm); + pktgen_stop(t); + + pr_debug("%s removing all device\n", t->tsk->comm); + pktgen_rem_all_ifs(t); + + pr_debug("%s removing thread\n", t->tsk->comm); + pktgen_rem_thread(t); + + return 0; +} + +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, + const char *ifname, bool exact) +{ + struct pktgen_dev *p, *pkt_dev = NULL; + size_t len = strlen(ifname); + + rcu_read_lock(); + list_for_each_entry_rcu(p, &t->if_list, list) + if (strncmp(p->odevname, ifname, len) == 0) { + if (p->odevname[len]) { + if (exact || p->odevname[len] != '@') + continue; + } + pkt_dev = p; + break; + } + + rcu_read_unlock(); + pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); + return pkt_dev; +} + +/* + * Adds a dev at front of if_list. + */ + +static int add_dev_to_thread(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + int rv = 0; + + /* This function cannot be called concurrently, as its called + * under pktgen_thread_lock mutex, but it can run from + * userspace on another CPU than the kthread. The if_lock() + * is used here to sync with concurrent instances of + * _rem_dev_from_if_list() invoked via kthread, which is also + * updating the if_list */ + if_lock(t); + + if (pkt_dev->pg_thread) { + pr_err("ERROR: already assigned to a thread\n"); + rv = -EBUSY; + goto out; + } + + pkt_dev->running = 0; + pkt_dev->pg_thread = t; + list_add_rcu(&pkt_dev->list, &t->if_list); + +out: + if_unlock(t); + return rv; +} + +/* Called under thread lock */ + +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) +{ + struct pktgen_dev *pkt_dev; + int err; + int node = cpu_to_node(t->cpu); + + /* We don't allow a device to be on several threads */ + + pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND); + if (pkt_dev) { + pr_err("ERROR: interface already used\n"); + return -EBUSY; + } + + pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node); + if (!pkt_dev) + return -ENOMEM; + + strcpy(pkt_dev->odevname, ifname); + pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, + sizeof(struct flow_state)), + node); + if (pkt_dev->flows == NULL) { + kfree(pkt_dev); + return -ENOMEM; + } + + pkt_dev->removal_mark = 0; + pkt_dev->nfrags = 0; + pkt_dev->delay = pg_delay_d; + pkt_dev->count = pg_count_d; + pkt_dev->sofar = 0; + pkt_dev->udp_src_min = 9; /* sink port */ + pkt_dev->udp_src_max = 9; + pkt_dev->udp_dst_min = 9; + pkt_dev->udp_dst_max = 9; + pkt_dev->vlan_p = 0; + pkt_dev->vlan_cfi = 0; + pkt_dev->vlan_id = 0xffff; + pkt_dev->svlan_p = 0; + pkt_dev->svlan_cfi = 0; + pkt_dev->svlan_id = 0xffff; + pkt_dev->burst = 1; + pkt_dev->node = NUMA_NO_NODE; + + err = pktgen_setup_dev(t->net, pkt_dev, ifname); + if (err) + goto out1; + if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) + pkt_dev->clone_skb = pg_clone_skb_d; + + pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir, + &pktgen_if_proc_ops, pkt_dev); + if (!pkt_dev->entry) { + pr_err("cannot create %s/%s procfs entry\n", + PG_PROC_DIR, ifname); + err = -EINVAL; + goto out2; + } +#ifdef CONFIG_XFRM + pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; + pkt_dev->ipsproto = IPPROTO_ESP; + + /* xfrm tunnel mode needs additional dst to extract outter + * ip header protocol/ttl/id field, here creat a phony one. + * instead of looking for a valid rt, which definitely hurting + * performance under such circumstance. + */ + pkt_dev->dstops.family = AF_INET; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; +#endif + + return add_dev_to_thread(t, pkt_dev); +out2: + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); +out1: +#ifdef CONFIG_XFRM + free_SAs(pkt_dev); +#endif + vfree(pkt_dev->flows); + kfree(pkt_dev); + return err; +} + +static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) +{ + struct pktgen_thread *t; + struct proc_dir_entry *pe; + struct task_struct *p; + + t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL, + cpu_to_node(cpu)); + if (!t) { + pr_err("ERROR: out of memory, can't create new thread\n"); + return -ENOMEM; + } + + mutex_init(&t->if_lock); + t->cpu = cpu; + + INIT_LIST_HEAD(&t->if_list); + + list_add_tail(&t->th_list, &pn->pktgen_threads); + init_completion(&t->start_done); + + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); + if (IS_ERR(p)) { + pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); + list_del(&t->th_list); + kfree(t); + return PTR_ERR(p); + } + kthread_bind(p, cpu); + t->tsk = p; + + pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, + &pktgen_thread_proc_ops, t); + if (!pe) { + pr_err("cannot create %s/%s procfs entry\n", + PG_PROC_DIR, t->tsk->comm); + kthread_stop(p); + list_del(&t->th_list); + kfree(t); + return -EINVAL; + } + + t->net = pn; + get_task_struct(p); + wake_up_process(p); + wait_for_completion(&t->start_done); + + return 0; +} + +/* + * Removes a device from the thread if_list. + */ +static void _rem_dev_from_if_list(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + struct list_head *q, *n; + struct pktgen_dev *p; + + if_lock(t); + list_for_each_safe(q, n, &t->if_list) { + p = list_entry(q, struct pktgen_dev, list); + if (p == pkt_dev) + list_del_rcu(&p->list); + } + if_unlock(t); +} + +static int pktgen_remove_device(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + pr_debug("remove_device pkt_dev=%p\n", pkt_dev); + + if (pkt_dev->running) { + pr_warn("WARNING: trying to remove a running interface, stopping it now\n"); + pktgen_stop_device(pkt_dev); + } + + /* Dis-associate from the interface */ + + if (pkt_dev->odev) { + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); + pkt_dev->odev = NULL; + } + + /* Remove proc before if_list entry, because add_device uses + * list to determine if interface already exist, avoid race + * with proc_create_data() */ + proc_remove(pkt_dev->entry); + + /* And update the thread if_list */ + _rem_dev_from_if_list(t, pkt_dev); + +#ifdef CONFIG_XFRM + free_SAs(pkt_dev); +#endif + vfree(pkt_dev->flows); + if (pkt_dev->page) + put_page(pkt_dev->page); + kfree_rcu(pkt_dev, rcu); + return 0; +} + +static int __net_init pg_net_init(struct net *net) +{ + struct pktgen_net *pn = net_generic(net, pg_net_id); + struct proc_dir_entry *pe; + int cpu, ret = 0; + + pn->net = net; + INIT_LIST_HEAD(&pn->pktgen_threads); + pn->pktgen_exiting = false; + pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net); + if (!pn->proc_dir) { + pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR); + return -ENODEV; + } + pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops); + if (pe == NULL) { + pr_err("cannot create %s procfs entry\n", PGCTRL); + ret = -EINVAL; + goto remove; + } + + for_each_online_cpu(cpu) { + int err; + + err = pktgen_create_thread(cpu, pn); + if (err) + pr_warn("Cannot create thread for cpu %d (%d)\n", + cpu, err); + } + + if (list_empty(&pn->pktgen_threads)) { + pr_err("Initialization failed for all threads\n"); + ret = -ENODEV; + goto remove_entry; + } + + return 0; + +remove_entry: + remove_proc_entry(PGCTRL, pn->proc_dir); +remove: + remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); + return ret; +} + +static void __net_exit pg_net_exit(struct net *net) +{ + struct pktgen_net *pn = net_generic(net, pg_net_id); + struct pktgen_thread *t; + struct list_head *q, *n; + LIST_HEAD(list); + + /* Stop all interfaces & threads */ + pn->pktgen_exiting = true; + + mutex_lock(&pktgen_thread_lock); + list_splice_init(&pn->pktgen_threads, &list); + mutex_unlock(&pktgen_thread_lock); + + list_for_each_safe(q, n, &list) { + t = list_entry(q, struct pktgen_thread, th_list); + list_del(&t->th_list); + kthread_stop(t->tsk); + put_task_struct(t->tsk); + kfree(t); + } + + remove_proc_entry(PGCTRL, pn->proc_dir); + remove_proc_entry(PG_PROC_DIR, pn->net->proc_net); +} + +static struct pernet_operations pg_net_ops = { + .init = pg_net_init, + .exit = pg_net_exit, + .id = &pg_net_id, + .size = sizeof(struct pktgen_net), +}; + +static int __init pg_init(void) +{ + int ret = 0; + + pr_info("%s", version); + ret = register_pernet_subsys(&pg_net_ops); + if (ret) + return ret; + ret = register_netdevice_notifier(&pktgen_notifier_block); + if (ret) + unregister_pernet_subsys(&pg_net_ops); + + return ret; +} + +static void __exit pg_cleanup(void) +{ + unregister_netdevice_notifier(&pktgen_notifier_block); + unregister_pernet_subsys(&pg_net_ops); + /* Don't need rcu_barrier() due to use of kfree_rcu() */ +} + +module_init(pg_init); +module_exit(pg_cleanup); + +MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>"); +MODULE_DESCRIPTION("Packet Generator tool"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(VERSION); +module_param(pg_count_d, int, 0); +MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject"); +module_param(pg_delay_d, int, 0); +MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)"); +module_param(pg_clone_skb_d, int, 0); +MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet"); +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Enable debugging of pktgen module"); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c new file mode 100644 index 000000000..598041b04 --- /dev/null +++ b/net/core/ptp_classifier.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* PTP classifier + */ + +/* The below program is the bpf_asm (tools/net/) representation of + * the opcode array in the ptp_filter structure. + * + * For convenience, this can easily be altered and reviewed with + * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a + * simple file containing the below program: + * + * ldh [12] ; load ethertype + * + * ; PTP over UDP over IPv4 over Ethernet + * test_ipv4: + * jneq #0x800, test_ipv6 ; ETH_P_IP ? + * ldb [23] ; load proto + * jneq #17, drop_ipv4 ; IPPROTO_UDP ? + * ldh [20] ; load frag offset field + * jset #0x1fff, drop_ipv4 ; don't allow fragments + * ldxb 4*([14]&0xf) ; load IP header len + * ldh [x + 16] ; load UDP dst port + * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 22] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x10 ; PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over Ethernet + * test_ipv6: + * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ? + * ldb [20] ; load proto + * jneq #17, drop_ipv6 ; IPPROTO_UDP ? + * ldh [56] ; load UDP dst port + * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ? + * ldh [62] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x20 ; PTP_CLASS_IPV6 + * ret a ; return PTP class + * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over 802.1Q over Ethernet + * test_8021q: + * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ? + * ldh [16] ; load inner type + * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ? + * ldb [18] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [18] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0xc0 ; PTP_CLASS_VLAN|PTP_CLASS_L2 + * ret a ; return PTP class + * + * ; PTP over UDP over IPv4 over 802.1Q over Ethernet + * test_8021q_ipv4: + * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ? + * ldb [27] ; load proto + * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ? + * ldh [24] ; load frag offset field + * jset #0x1fff, drop_8021q_ipv4; don't allow fragments + * ldxb 4*([18]&0xf) ; load IP header len + * ldh [x + 20] ; load UDP dst port + * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ? + * ldh [x + 26] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x90 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4 + * ret a ; return PTP class + * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over UDP over IPv6 over 802.1Q over Ethernet + * test_8021q_ipv6: + * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ? + * ldb [24] ; load proto + * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ? + * ldh [60] ; load UDP dst port + * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ? + * ldh [66] ; load payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0xa0 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6 + * ret a ; return PTP class + * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE + * + * ; PTP over Ethernet + * test_ieee1588: + * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ? + * ldb [14] ; load payload + * and #0x8 ; as we don't have ports here, test + * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these + * ldh [14] ; reload payload + * and #0xf ; mask PTP_CLASS_VMASK + * or #0x40 ; PTP_CLASS_L2 + * ret a ; return PTP class + * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE + */ + +#include <linux/skbuff.h> +#include <linux/filter.h> +#include <linux/ptp_classify.h> + +static struct bpf_prog *ptp_insns __read_mostly; + +unsigned int ptp_classify_raw(const struct sk_buff *skb) +{ + return bpf_prog_run(ptp_insns, skb); +} +EXPORT_SYMBOL_GPL(ptp_classify_raw); + +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) +{ + u8 *ptr = skb_mac_header(skb); + + if (type & PTP_CLASS_VLAN) + ptr += VLAN_HLEN; + + switch (type & PTP_CLASS_PMASK) { + case PTP_CLASS_IPV4: + ptr += IPV4_HLEN(ptr) + UDP_HLEN; + break; + case PTP_CLASS_IPV6: + ptr += IP6_HLEN + UDP_HLEN; + break; + case PTP_CLASS_L2: + break; + default: + return NULL; + } + + ptr += ETH_HLEN; + + /* Ensure that the entire header is present in this packet. */ + if (ptr + sizeof(struct ptp_header) > skb->data + skb->len) + return NULL; + + return (struct ptp_header *)ptr; +} +EXPORT_SYMBOL_GPL(ptp_parse_header); + +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(skb, type); + if (!hdr) + return false; + + return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC; +} +EXPORT_SYMBOL_GPL(ptp_msg_is_sync); + +void __init ptp_classifier_init(void) +{ + static struct sock_filter ptp_filter[] __initdata = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x00000016 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000010 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 9, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x0000003e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000020 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 32, 0x00008100 }, + { 0x28, 0, 0, 0x00000010 }, + { 0x15, 0, 7, 0x000088f7 }, + { 0x30, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 35, 0x00000000 }, + { 0x28, 0, 0, 0x00000012 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x000000c0 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x0000001b }, + { 0x15, 0, 9, 0x00000011 }, + { 0x28, 0, 0, 0x00000018 }, + { 0x45, 7, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x00000012 }, + { 0x48, 0, 0, 0x00000014 }, + { 0x15, 0, 4, 0x0000013f }, + { 0x48, 0, 0, 0x0000001a }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000090 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000018 }, + { 0x15, 0, 6, 0x00000011 }, + { 0x28, 0, 0, 0x0000003c }, + { 0x15, 0, 4, 0x0000013f }, + { 0x28, 0, 0, 0x00000042 }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x000000a0 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + { 0x15, 0, 7, 0x000088f7 }, + { 0x30, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x00000008 }, + { 0x15, 0, 4, 0x00000000 }, + { 0x28, 0, 0, 0x0000000e }, + { 0x54, 0, 0, 0x0000000f }, + { 0x44, 0, 0, 0x00000040 }, + { 0x16, 0, 0, 0x00000000 }, + { 0x06, 0, 0, 0x00000000 }, + }; + struct sock_fprog_kern ptp_prog; + + ptp_prog.len = ARRAY_SIZE(ptp_filter); + ptp_prog.filter = ptp_filter; + + BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); +} diff --git a/net/core/request_sock.c b/net/core/request_sock.c new file mode 100644 index 000000000..63de5c635 --- /dev/null +++ b/net/core/request_sock.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * NET Generic infrastructure for Network protocols. + * + * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * From code originally in include/net/tcp.h + */ + +#include <linux/module.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/tcp.h> +#include <linux/vmalloc.h> + +#include <net/request_sock.h> + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. + * One SYN_RECV socket costs about 80bytes on a 32bit machine. + * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + * + * The minimum value of it is 128. Experiments with real servers show that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. + * This value is adjusted to 128 for low memory machines, + * and it will increase in proportion to the memory of machine. + * Note : Dont forget somaxconn that may limit backlog too. + */ + +void reqsk_queue_alloc(struct request_sock_queue *queue) +{ + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; + + queue->rskq_accept_head = NULL; +} + +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. A corner + * case might also exist in tcp_v4_hnd_req() that will trigger this locking + * order. + * + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; + + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->tfo_listener = false; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + reqsk_put(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. + */ + req->rsk_timer.expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); +} diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c new file mode 100644 index 000000000..7cf1e42d7 --- /dev/null +++ b/net/core/rtnetlink.c @@ -0,0 +1,6248 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetic was wrong. + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <linux/if_addr.h> +#include <linux/if_bridge.h> +#include <linux/if_vlan.h> +#include <linux/pci.h> +#include <linux/etherdevice.h> +#include <linux/bpf.h> + +#include <linux/uaccess.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/fib_rules.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> + +#include "dev.h" + +#define RTNL_MAX_TYPE 50 +#define RTNL_SLAVE_MAX_TYPE 40 + +struct rtnl_link { + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; + struct module *owner; + unsigned int flags; + struct rcu_head rcu; +}; + +static DEFINE_MUTEX(rtnl_mutex); + +void rtnl_lock(void) +{ + mutex_lock(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock); + +int rtnl_lock_killable(void) +{ + return mutex_lock_killable(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock_killable); + +static struct sk_buff *defer_kfree_skb_list; +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (head && tail) { + tail->next = defer_kfree_skb_list; + defer_kfree_skb_list = head; + } +} +EXPORT_SYMBOL(rtnl_kfree_skbs); + +void __rtnl_unlock(void) +{ + struct sk_buff *head = defer_kfree_skb_list; + + defer_kfree_skb_list = NULL; + + /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() + * is used. In some places, e.g. in cfg80211, we have code that will do + * something like + * rtnl_lock() + * wiphy_lock() + * ... + * rtnl_unlock() + * + * and because netdev_run_todo() acquires the RTNL for items on the list + * we could cause a situation such as this: + * Thread 1 Thread 2 + * rtnl_lock() + * unregister_netdevice() + * __rtnl_unlock() + * rtnl_lock() + * wiphy_lock() + * rtnl_unlock() + * netdev_run_todo() + * __rtnl_unlock() + * + * // list not empty now + * // because of thread 2 + * rtnl_lock() + * while (!list_empty(...)) + * rtnl_lock() + * wiphy_lock() + * **** DEADLOCK **** + * + * However, usage of __rtnl_unlock() is rare, and so we can ensure that + * it's not used in cases where something is added to do the list. + */ + WARN_ON(!list_empty(&net_todo_list)); + + mutex_unlock(&rtnl_mutex); + + while (head) { + struct sk_buff *next = head->next; + + kfree_skb(head); + cond_resched(); + head = next; + } +} + +void rtnl_unlock(void) +{ + /* This fellow will unlock it for us. */ + netdev_run_todo(); +} +EXPORT_SYMBOL(rtnl_unlock); + +int rtnl_trylock(void) +{ + return mutex_trylock(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_trylock); + +int rtnl_is_locked(void) +{ + return mutex_is_locked(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_is_locked); + +bool refcount_dec_and_rtnl_lock(refcount_t *r) +{ + return refcount_dec_and_mutex_lock(r, &rtnl_mutex); +} +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); + +#ifdef CONFIG_PROVE_LOCKING +bool lockdep_rtnl_is_held(void) +{ + return lockdep_is_held(&rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_is_held); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ + +static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; + +static inline int rtm_msgindex(int msgtype) +{ + int msgindex = msgtype - RTM_BASE; + + /* + * msgindex < 0 implies someone tried to register a netlink + * control code. msgindex >= RTM_NR_MSGTYPES may indicate that + * the message type has not been added to linux/rtnetlink.h + */ + BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + + return msgindex; +} + +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link __rcu **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return rcu_dereference_rtnl(tab[msgtype]); +} + +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; + int msgindex; + int ret = -ENOBUFS; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + msgindex = rtm_msgindex(msgtype); + + rtnl_lock(); + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (tab == NULL) { + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; + + /* ensures we see the 0 stores */ + rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); + } + + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } + + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; + + WARN_ON(doit && link->doit && link->doit != doit); + if (doit) + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); + if (dumpit) + link->dumpit = dumpit; + + WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && + (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; +} + +/** + * rtnl_register_module - Register a rtnetlink message type + * + * @owner: module registering the hook (THIS_MODULE) + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + * + * Like rtnl_register, but for use by removable modules. + */ +int rtnl_register_module(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(owner, protocol, msgtype, + doit, dumpit, flags); +} +EXPORT_SYMBOL_GPL(rtnl_register_module); + +/** + * rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + */ +void rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + int err; + + err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, + flags); + if (err) + pr_err("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", protocol, msgtype); +} + +/** + * rtnl_unregister - Unregister a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * + * Returns 0 on success or a negative error code. + */ +int rtnl_unregister(int protocol, int msgtype) +{ + struct rtnl_link __rcu **tab; + struct rtnl_link *link; + int msgindex; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + msgindex = rtm_msgindex(msgtype); + + rtnl_lock(); + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { + rtnl_unlock(); + return -ENOENT; + } + + link = rtnl_dereference(tab[msgindex]); + RCU_INIT_POINTER(tab[msgindex], NULL); + rtnl_unlock(); + + kfree_rcu(link, rcu); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_unregister); + +/** + * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol + * @protocol : Protocol family or PF_UNSPEC + * + * Identical to calling rtnl_unregster() for all registered message types + * of a certain protocol family. + */ +void rtnl_unregister_all(int protocol) +{ + struct rtnl_link __rcu **tab; + struct rtnl_link *link; + int msgindex; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + + rtnl_lock(); + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { + rtnl_unlock(); + return; + } + RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = rtnl_dereference(tab[msgindex]); + if (!link) + continue; + + RCU_INIT_POINTER(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } + rtnl_unlock(); + + synchronize_net(); + + kfree(tab); +} +EXPORT_SYMBOL_GPL(rtnl_unregister_all); + +static LIST_HEAD(link_ops); + +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ + const struct rtnl_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) + return ops; + } + return NULL; +} + +/** + * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * The caller must hold the rtnl_mutex. This function should be used + * by drivers that create devices during module initialization. It + * must be called before registering the devices. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_link_register(struct rtnl_link_ops *ops) +{ + if (rtnl_link_ops_get(ops->kind)) + return -EEXIST; + + /* The check for alloc/setup is here because if ops + * does not have that filled up, it is not possible + * to use the ops for creating device. So do not + * fill up dellink as well. That disables rtnl_dellink. + */ + if ((ops->alloc || ops->setup) && !ops->dellink) + ops->dellink = unregister_netdevice_queue; + + list_add_tail(&ops->list, &link_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_link_register); + +/** + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * Returns 0 on success or a negative error code. + */ +int rtnl_link_register(struct rtnl_link_ops *ops) +{ + int err; + + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + + rtnl_lock(); + err = __rtnl_link_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_link_register); + +static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) +{ + struct net_device *dev; + LIST_HEAD(list_kill); + + for_each_netdev(net, dev) { + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); + } + unregister_netdevice_many(&list_kill); +} + +/** + * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + * + * The caller must hold the rtnl_mutex and guarantee net_namespace_list + * integrity (hold pernet_ops_rwsem for writing to close the race + * with setup_net() and cleanup_net()). + */ +void __rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + struct net *net; + + for_each_net(net) { + __rtnl_kill_links(net, ops); + } + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_link_unregister); + +/* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace. + */ +static void rtnl_lock_unregistering_all(void) +{ + struct net *net; + bool unregistering; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(&netdev_unregistering_wq, &wait); + for (;;) { + unregistering = false; + rtnl_lock(); + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ + for_each_net(net) { + if (atomic_read(&net->dev_unreg_count) > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + + wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + remove_wait_queue(&netdev_unregistering_wq, &wait); +} + +/** + * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + */ +void rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + /* Close the race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock_unregistering_all(); + __rtnl_link_unregister(ops); + rtnl_unlock(); + up_write(&pernet_ops_rwsem); +} +EXPORT_SYMBOL_GPL(rtnl_link_unregister); + +static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + size_t size = 0; + + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); + if (!master_dev) + goto out; + + ops = master_dev->rtnl_link_ops; + if (!ops || !ops->get_slave_size) + goto out; + /* IFLA_INFO_SLAVE_DATA + nested data */ + size = nla_total_size(sizeof(struct nlattr)) + + ops->get_slave_size(master_dev, dev); + +out: + rcu_read_unlock(); + return size; +} + +static size_t rtnl_link_get_size(const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + size_t size; + + if (!ops) + return 0; + + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + + if (ops->get_size) + /* IFLA_INFO_DATA + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + ops->get_size(dev); + + if (ops->get_xstats_size) + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); + + size += rtnl_link_get_slave_info_data_size(dev); + + return size; +} + +static LIST_HEAD(rtnl_af_ops); + +static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +{ + const struct rtnl_af_ops *ops; + + ASSERT_RTNL(); + + list_for_each_entry(ops, &rtnl_af_ops, list) { + if (ops->family == family) + return ops; + } + + return NULL; +} + +/** + * rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * Returns 0 on success or a negative error code. + */ +void rtnl_af_register(struct rtnl_af_ops *ops) +{ + rtnl_lock(); + list_add_tail_rcu(&ops->list, &rtnl_af_ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_af_register); + +/** + * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + */ +void rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + rtnl_lock(); + list_del_rcu(&ops->list); + rtnl_unlock(); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(rtnl_af_unregister); + +static size_t rtnl_link_get_af_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + struct rtnl_af_ops *af_ops; + size_t size; + + /* IFLA_AF_SPEC */ + size = nla_total_size(sizeof(struct nlattr)); + + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_link_af_size) { + /* AF_* + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + af_ops->get_link_af_size(dev, ext_filter_mask); + } + } + rcu_read_unlock(); + + return size; +} + +static bool rtnl_have_link_slave_info(const struct net_device *dev) +{ + struct net_device *master_dev; + bool ret = false; + + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); + if (master_dev && master_dev->rtnl_link_ops) + ret = true; + rcu_read_unlock(); + return ret; +} + +static int rtnl_link_slave_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + struct net_device *master_dev; + const struct rtnl_link_ops *ops; + struct nlattr *slave_data; + int err; + + master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + if (!master_dev) + return 0; + ops = master_dev->rtnl_link_ops; + if (!ops) + return 0; + if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) + return -EMSGSIZE; + if (ops->fill_slave_info) { + slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); + if (!slave_data) + return -EMSGSIZE; + err = ops->fill_slave_info(skb, master_dev, dev); + if (err < 0) + goto err_cancel_slave_data; + nla_nest_end(skb, slave_data); + } + return 0; + +err_cancel_slave_data: + nla_nest_cancel(skb, slave_data); + return err; +} + +static int rtnl_link_info_fill(struct sk_buff *skb, + const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *data; + int err; + + if (!ops) + return 0; + if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) + return -EMSGSIZE; + if (ops->fill_xstats) { + err = ops->fill_xstats(skb, dev); + if (err < 0) + return err; + } + if (ops->fill_info) { + data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); + if (data == NULL) + return -EMSGSIZE; + err = ops->fill_info(skb, dev); + if (err < 0) + goto err_cancel_data; + nla_nest_end(skb, data); + } + return 0; + +err_cancel_data: + nla_nest_cancel(skb, data); + return err; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ + struct nlattr *linkinfo; + int err = -EMSGSIZE; + + linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + err = rtnl_link_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + err = rtnl_link_slave_info_fill(skb, dev); + if (err < 0) + goto err_cancel_link; + + nla_nest_end(skb, linkinfo); + return 0; + +err_cancel_link: + nla_nest_cancel(skb, linkinfo); +out: + return err; +} + +int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) +{ + struct sock *rtnl = net->rtnl; + + return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); +} + +int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) +{ + struct sock *rtnl = net->rtnl; + + return nlmsg_unicast(rtnl, skb, pid); +} +EXPORT_SYMBOL(rtnl_unicast); + +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) +{ + struct sock *rtnl = net->rtnl; + + nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); +} +EXPORT_SYMBOL(rtnl_notify); + +void rtnl_set_sk_err(struct net *net, u32 group, int error) +{ + struct sock *rtnl = net->rtnl; + + netlink_set_err(rtnl, 0, group, error); +} +EXPORT_SYMBOL(rtnl_set_sk_err); + +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) +{ + struct nlattr *mx; + int i, valid = 0; + + /* nothing is dumped for dst_default_metrics, so just skip the loop */ + if (metrics == dst_default_metrics.metrics) + return 0; + + mx = nla_nest_start_noflag(skb, RTA_METRICS); + if (mx == NULL) + return -ENOBUFS; + + for (i = 0; i < RTAX_MAX; i++) { + if (metrics[i]) { + if (i == RTAX_CC_ALGO - 1) { + char tmp[TCP_CA_NAME_MAX], *name; + + name = tcp_ca_get_name_by_key(metrics[i], tmp); + if (!name) + continue; + if (nla_put_string(skb, i + 1, name)) + goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + if (!user_features) + continue; + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, i + 1, metrics[i])) + goto nla_put_failure; + } + valid++; + } + } + + if (!valid) { + nla_nest_cancel(skb, mx); + return 0; + } + + return nla_nest_end(skb, mx); + +nla_put_failure: + nla_nest_cancel(skb, mx); + return -EMSGSIZE; +} +EXPORT_SYMBOL(rtnetlink_put_metrics); + +int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, + long expires, u32 error) +{ + struct rta_cacheinfo ci = { + .rta_error = error, + .rta_id = id, + }; + + if (dst) { + ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + ci.rta_used = dst->__use; + ci.rta_clntref = atomic_read(&dst->__refcnt); + } + if (expires) { + unsigned long clock; + + clock = jiffies_to_clock_t(abs(expires)); + clock = min_t(unsigned long, clock, INT_MAX); + ci.rta_expires = (expires > 0) ? clock : -clock; + } + return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); +} +EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); + +static void set_operstate(struct net_device *dev, unsigned char transition) +{ + unsigned char operstate = dev->operstate; + + switch (transition) { + case IF_OPER_UP: + if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || + operstate == IF_OPER_UNKNOWN) && + !netif_dormant(dev) && !netif_testing(dev)) + operstate = IF_OPER_UP; + break; + + case IF_OPER_TESTING: + if (netif_oper_up(dev)) + operstate = IF_OPER_TESTING; + break; + + case IF_OPER_DORMANT: + if (netif_oper_up(dev)) + operstate = IF_OPER_DORMANT; + break; + } + + if (dev->operstate != operstate) { + write_lock(&dev_base_lock); + dev->operstate = operstate; + write_unlock(&dev_base_lock); + netdev_state_change(dev); + } +} + +static unsigned int rtnl_dev_get_flags(const struct net_device *dev) +{ + return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | + (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); +} + +static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + const struct ifinfomsg *ifm) +{ + unsigned int flags = ifm->ifi_flags; + + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | + (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); + + return flags; +} + +static void copy_rtnl_link_stats(struct rtnl_link_stats *a, + const struct rtnl_link_stats64 *b) +{ + a->rx_packets = b->rx_packets; + a->tx_packets = b->tx_packets; + a->rx_bytes = b->rx_bytes; + a->tx_bytes = b->tx_bytes; + a->rx_errors = b->rx_errors; + a->tx_errors = b->tx_errors; + a->rx_dropped = b->rx_dropped; + a->tx_dropped = b->tx_dropped; + + a->multicast = b->multicast; + a->collisions = b->collisions; + + a->rx_length_errors = b->rx_length_errors; + a->rx_over_errors = b->rx_over_errors; + a->rx_crc_errors = b->rx_crc_errors; + a->rx_frame_errors = b->rx_frame_errors; + a->rx_fifo_errors = b->rx_fifo_errors; + a->rx_missed_errors = b->rx_missed_errors; + + a->tx_aborted_errors = b->tx_aborted_errors; + a->tx_carrier_errors = b->tx_carrier_errors; + a->tx_fifo_errors = b->tx_fifo_errors; + a->tx_heartbeat_errors = b->tx_heartbeat_errors; + a->tx_window_errors = b->tx_window_errors; + + a->rx_compressed = b->rx_compressed; + a->tx_compressed = b->tx_compressed; + + a->rx_nohandler = b->rx_nohandler; +} + +/* All VF info */ +static inline int rtnl_vfinfo_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nla_total_size(0); + size += num_vfs * + (nla_total_size(0) + + nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_broadcast)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + + nla_total_size(sizeof(struct ifla_vf_rate)) + + nla_total_size(sizeof(struct ifla_vf_link_state)) + + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + nla_total_size(sizeof(struct ifla_vf_trust))); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + size += num_vfs * + (nla_total_size(0) + /* nest IFLA_VF_STATS */ + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64))); + } + return size; + } else + return 0; +} + +static size_t rtnl_port_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + size_t port_size = nla_total_size(4) /* PORT_VF */ + + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + + nla_total_size(1) /* PROT_VDP_REQUEST */ + + nla_total_size(2); /* PORT_VDP_RESPONSE */ + size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); + size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) + return 0; + if (dev_num_vf(dev->dev.parent)) + return port_self_size + vf_ports_size + + vf_port_size * dev_num_vf(dev->dev.parent); + else + return port_self_size; +} + +static size_t rtnl_xdp_size(void) +{ + size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ + nla_total_size(4); /* XDP_<mode>_PROG_ID */ + + return xdp_size; +} + +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + size_t size; + + if (list_empty(&dev->name_node->list)) + return 0; + size = nla_total_size(0); + list_for_each_entry(name_node, &dev->name_node->list, list) + size += nla_total_size(ALTIFNAMSIZ); + return size; +} + +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + +static noinline size_t if_nlmsg_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + + nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + + nla_total_size(4) /* IFLA_TXQLEN */ + + nla_total_size(4) /* IFLA_WEIGHT */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_CARRIER */ + + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_ALLMULTI */ + + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + + nla_total_size(4) /* IFLA_LINK_NETNSID */ + + nla_total_size(4) /* IFLA_GROUP */ + + nla_total_size(ext_filter_mask + & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + + rtnl_xdp_size() /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + + rtnl_proto_down_size(dev) /* proto down */ + + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + + nla_total_size(4) /* IFLA_MIN_MTU */ + + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + + 0; +} + +static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *vf_ports; + struct nlattr *vf_port; + int vf; + int err; + + vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); + if (!vf_ports) + return -EMSGSIZE; + + for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { + vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); + if (!vf_port) + goto nla_put_failure; + if (nla_put_u32(skb, IFLA_PORT_VF, vf)) + goto nla_put_failure; + err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); + if (err == -EMSGSIZE) + goto nla_put_failure; + if (err) { + nla_nest_cancel(skb, vf_port); + continue; + } + nla_nest_end(skb, vf_port); + } + + nla_nest_end(skb, vf_ports); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, vf_ports); + return -EMSGSIZE; +} + +static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *port_self; + int err; + + port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); + if (!port_self) + return -EMSGSIZE; + + err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); + if (err) { + nla_nest_cancel(skb, port_self); + return (err == -EMSGSIZE) ? err : 0; + } + + nla_nest_end(skb, port_self); + + return 0; +} + +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, + u32 ext_filter_mask) +{ + int err; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || + !(ext_filter_mask & RTEXT_FILTER_VF)) + return 0; + + err = rtnl_port_self_fill(skb, dev); + if (err) + return err; + + if (dev_num_vf(dev->dev.parent)) { + err = rtnl_vf_ports_fill(skb, dev); + if (err) + return err; + } + + return 0; +} + +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_item_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + +static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) +{ + char name[IFNAMSIZ]; + int err; + + err = dev_get_phys_port_name(dev, name, sizeof(name)); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) + return -EMSGSIZE; + + return 0; +} + +static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct netdev_phys_item_id ppid = { }; + int err; + + err = dev_get_port_parent_id(dev, &ppid, false); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + +static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, + struct net_device *dev) +{ + struct rtnl_link_stats64 *sp; + struct nlattr *attr; + + attr = nla_reserve_64bit(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64), IFLA_PAD); + if (!attr) + return -EMSGSIZE; + + sp = nla_data(attr); + dev_get_stats(dev, sp); + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats(nla_data(attr), sp); + + return 0; +} + +static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, + struct net_device *dev, + int vfs_num, + struct nlattr *vfinfo, + u32 ext_filter_mask) +{ + struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; + struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; + struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; + struct ifla_vf_mac vf_mac; + struct ifla_vf_broadcast vf_broadcast; + struct ifla_vf_info ivi; + struct ifla_vf_guid node_guid; + struct ifla_vf_guid port_guid; + + memset(&ivi, 0, sizeof(ivi)); + + /* Not all SR-IOV capable drivers support the + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. + */ + ivi.spoofchk = -1; + ivi.rss_query_en = -1; + ivi.trusted = -1; + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); + if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) + return 0; + + memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); + memset(&node_guid, 0, sizeof(node_guid)); + memset(&port_guid, 0, sizeof(port_guid)); + + vf_mac.vf = + vf_vlan.vf = + vf_vlan_info.vf = + vf_rate.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = + vf_linkstate.vf = + vf_rss_query_en.vf = + vf_trust.vf = + node_guid.vf = + port_guid.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; + vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); + if (!vf) + goto nla_put_vfinfo_failure; + if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || + nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || + nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate) || + nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) + goto nla_put_vf_failure; + + if (dev->netdev_ops->ndo_get_vf_guid && + !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, + &port_guid)) { + if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), + &node_guid) || + nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), + &port_guid)) + goto nla_put_vf_failure; + } + vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); + if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); + if (!vfstats) + goto nla_put_vf_failure; + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfstats); + } + nla_nest_end(skb, vf); + return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; +} + +static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, + struct net_device *dev, + u32 ext_filter_mask) +{ + struct nlattr *vfinfo; + int i, num_vfs; + + if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) + return 0; + + num_vfs = dev_num_vf(dev->dev.parent); + if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) + return -EMSGSIZE; + + if (!dev->netdev_ops->ndo_get_vf_config) + return 0; + + vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + return -EMSGSIZE; + + for (i = 0; i < num_vfs; i++) { + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) + return -EMSGSIZE; + } + + nla_nest_end(skb, vfinfo); + return 0; +} + +static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +{ + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + + if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) + return -EMSGSIZE; + + return 0; +} + +static u32 rtnl_xdp_prog_skb(struct net_device *dev) +{ + const struct bpf_prog *generic_xdp_prog; + + ASSERT_RTNL(); + + generic_xdp_prog = rtnl_dereference(dev->xdp_prog); + if (!generic_xdp_prog) + return 0; + return generic_xdp_prog->aux->id; +} + +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return dev_xdp_prog_id(dev, XDP_MODE_DRV); +} + +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return dev_xdp_prog_id(dev, XDP_MODE_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; + + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; + + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; + + return 0; +} + +static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *xdp; + u32 prog_id; + int err; + u8 mode; + + xdp = nla_nest_start_noflag(skb, IFLA_XDP); + if (!xdp) + return -EMSGSIZE; + + prog_id = 0; + mode = XDP_ATTACHED_NONE; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); + if (err) + goto err_cancel; + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); + if (err) + goto err_cancel; + + if (prog_id && mode != XDP_ATTACHED_MULTI) { + err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); + if (err) + goto err_cancel; + } + + nla_nest_end(skb, xdp); + return 0; + +err_cancel: + nla_nest_cancel(skb, xdp); + return err; +} + +static u32 rtnl_get_event(unsigned long event) +{ + u32 rtnl_event_type = IFLA_EVENT_NONE; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event_type = IFLA_EVENT_REBOOT; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event_type = IFLA_EVENT_FEATURES; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_RESEND_IGMP: + rtnl_event_type = IFLA_EVENT_IGMP_RESEND; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; + break; + default: + break; + } + + return rtnl_event_type; +} + +static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device *upper_dev; + int ret = 0; + + rcu_read_lock(); + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + + rcu_read_unlock(); + return ret; +} + +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, + bool force) +{ + int ifindex = dev_get_iflink(dev); + + if (force || dev->ifindex != ifindex) + return nla_put_u32(skb, IFLA_LINK, ifindex); + + return 0; +} + +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + +static int rtnl_fill_link_netnsid(struct sk_buff *skb, + const struct net_device *dev, + struct net *src_net, gfp_t gfp) +{ + bool put_iflink = false; + + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { + struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); + + if (!net_eq(dev_net(dev), link_net)) { + int id = peernet2id_alloc(src_net, link_net, gfp); + + if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) + return -EMSGSIZE; + + put_iflink = true; + } + } + + return nla_put_iflink(skb, dev, put_iflink); +} + +static int rtnl_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, + u32 ext_filter_mask) +{ + const struct rtnl_af_ops *af_ops; + struct nlattr *af_spec; + + af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af_spec) + return -EMSGSIZE; + + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + struct nlattr *af; + int err; + + if (!af_ops->fill_link_af) + continue; + + af = nla_nest_start_noflag(skb, af_ops->family); + if (!af) + return -EMSGSIZE; + + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + return -EMSGSIZE; + + nla_nest_end(skb, af); + } + + nla_nest_end(skb, af_spec); + return 0; +} + +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int rtnl_fill_ifinfo(struct sk_buff *skb, + struct net_device *dev, struct net *src_net, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags, u32 ext_filter_mask, + u32 event, int *new_nsid, int new_ifindex, + int tgt_netnsid, gfp_t gfp) +{ + struct ifinfomsg *ifm; + struct nlmsghdr *nlh; + struct Qdisc *qdisc; + + ASSERT_RTNL(); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifi_family = AF_UNSPEC; + ifm->__ifi_pad = 0; + ifm->ifi_type = dev->type; + ifm->ifi_index = dev->ifindex; + ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_change = change; + + if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) + goto nla_put_failure; + + qdisc = rtnl_dereference(dev->qdisc); + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || + nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + nla_put_u8(skb, IFLA_OPERSTATE, + netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || + nla_put_u32(skb, IFLA_MTU, dev->mtu) || + nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || + nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || + nla_put_u32(skb, IFLA_GROUP, dev->group) || + nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || +#ifdef CONFIG_RPS + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +#endif + put_master_ifindex(skb, dev) || + nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || + nla_put_ifalias(skb, dev) || + nla_put_u32(skb, IFLA_CARRIER_CHANGES, + atomic_read(&dev->carrier_up_count) + + atomic_read(&dev->carrier_down_count)) || + nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, + atomic_read(&dev->carrier_up_count)) || + nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, + atomic_read(&dev->carrier_down_count))) + goto nla_put_failure; + + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + + if (event != IFLA_EVENT_NONE) { + if (nla_put_u32(skb, IFLA_EVENT, event)) + goto nla_put_failure; + } + + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure; + + if (dev->addr_len) { + if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || + nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) + goto nla_put_failure; + } + + if (rtnl_phys_port_id_fill(skb, dev)) + goto nla_put_failure; + + if (rtnl_phys_port_name_fill(skb, dev)) + goto nla_put_failure; + + if (rtnl_phys_switch_id_fill(skb, dev)) + goto nla_put_failure; + + if (rtnl_fill_stats(skb, dev)) + goto nla_put_failure; + + if (rtnl_fill_vf(skb, dev, ext_filter_mask)) + goto nla_put_failure; + + if (rtnl_port_fill(skb, dev, ext_filter_mask)) + goto nla_put_failure; + + if (rtnl_xdp_fill(skb, dev)) + goto nla_put_failure; + + if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { + if (rtnl_link_fill(skb, dev) < 0) + goto nla_put_failure; + } + + if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) + goto nla_put_failure; + + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; + if (new_ifindex && + nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) + goto nla_put_failure; + + if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && + nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) + goto nla_put_failure; + + rcu_read_lock(); + if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) + goto nla_put_failure_rcu; + rcu_read_unlock(); + + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure; + + if (dev->dev.parent && + nla_put_string(skb, IFLA_PARENT_DEV_NAME, + dev_name(dev->dev.parent))) + goto nla_put_failure; + + if (dev->dev.parent && dev->dev.parent->bus && + nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, + dev->dev.parent->bus->name)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure_rcu: + rcu_read_unlock(); +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, + [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, + [IFLA_MTU] = { .type = NLA_U32 }, + [IFLA_LINK] = { .type = NLA_U32 }, + [IFLA_MASTER] = { .type = NLA_U32 }, + [IFLA_CARRIER] = { .type = NLA_U8 }, + [IFLA_TXQLEN] = { .type = NLA_U32 }, + [IFLA_WEIGHT] = { .type = NLA_U32 }, + [IFLA_OPERSTATE] = { .type = NLA_U8 }, + [IFLA_LINKMODE] = { .type = NLA_U8 }, + [IFLA_LINKINFO] = { .type = NLA_NESTED }, + [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_NET_NS_FD] = { .type = NLA_U32 }, + /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to + * allow 0-length string (needed to remove an alias). + */ + [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, + [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, + [IFLA_VF_PORTS] = { .type = NLA_NESTED }, + [IFLA_PORT_SELF] = { .type = NLA_NESTED }, + [IFLA_AF_SPEC] = { .type = NLA_NESTED }, + [IFLA_EXT_MASK] = { .type = NLA_U32 }, + [IFLA_PROMISCUITY] = { .type = NLA_U32 }, + [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, + [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, + [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ + [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, + [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, + [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, + [IFLA_GROUP] = { .type = NLA_U32 }, + [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, + [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, + [IFLA_MIN_MTU] = { .type = NLA_U32 }, + [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, + [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, + [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, + [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, + [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, + [IFLA_ALLMULTI] = { .type = NLA_REJECT }, +}; + +static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { + [IFLA_INFO_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_DATA] = { .type = NLA_NESTED }, + [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { + [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, + [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, + [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, + [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, + [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, + [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, + [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, + [IFLA_VF_STATS] = { .type = NLA_NESTED }, + [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, + [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, + [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, +}; + +static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { + [IFLA_PORT_VF] = { .type = NLA_U32 }, + [IFLA_PORT_PROFILE] = { .type = NLA_STRING, + .len = PORT_PROFILE_MAX }, + [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, + .len = PORT_UUID_MAX }, + [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, + .len = PORT_UUID_MAX }, + [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, + [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, + + /* Unused, but we need to keep it here since user space could + * fill it. It's also broken with regard to NLA_BINARY use in + * combination with structs. + */ + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi) }, +}; + +static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, + [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, + [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, + [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, +}; + +static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +{ + const struct rtnl_link_ops *ops = NULL; + struct nlattr *linfo[IFLA_INFO_MAX + 1]; + + if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) + return NULL; + + if (linfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } + + return ops; +} + +static bool link_master_filtered(struct net_device *dev, int master_idx) +{ + struct net_device *master; + + if (!master_idx) + return false; + + master = netdev_master_upper_dev_get(dev); + + /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need + * another invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + + if (!master || master->ifindex != master_idx) + return true; + + return false; +} + +static bool link_kind_filtered(const struct net_device *dev, + const struct rtnl_link_ops *kind_ops) +{ + if (kind_ops && dev->rtnl_link_ops != kind_ops) + return true; + + return false; +} + +static bool link_dump_filtered(struct net_device *dev, + int master_idx, + const struct rtnl_link_ops *kind_ops) +{ + if (link_master_filtered(dev, master_idx) || + link_kind_filtered(dev, kind_ops)) + return true; + + return false; +} + +/** + * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. + * @sk: netlink socket + * @netnsid: network namespace identifier + * + * Returns the network namespace identified by netnsid on success or an error + * pointer on failure. + */ +struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) +{ + struct net *net; + + net = get_net_ns_by_id(sock_net(sk), netnsid); + if (!net) + return ERR_PTR(-EINVAL); + + /* For now, the caller is required to have CAP_NET_ADMIN in + * the user namespace owning the target net ns. + */ + if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EACCES); + } + return net; +} +EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); + +static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, + bool strict_check, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int hdrlen; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); + return -EINVAL; + } + if (ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); + return -EINVAL; + } + + return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, + IFLA_MAX, ifla_policy, + extack); + } + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, + extack); +} + +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + const struct nlmsghdr *nlh = cb->nlh; + struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct hlist_head *head; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + const struct rtnl_link_ops *kind_ops = NULL; + unsigned int flags = NLM_F_MULTI; + int master_idx = 0; + int netnsid = -1; + int err, i; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); + if (err < 0) { + if (cb->strict_check) + return err; + + goto walk_entries; + } + + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + /* new attributes should only be added with strict checking */ + switch (i) { + case IFLA_TARGET_NETNSID: + netnsid = nla_get_s32(tb[i]); + tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); + if (IS_ERR(tgt_net)) { + NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); + return PTR_ERR(tgt_net); + } + break; + case IFLA_EXT_MASK: + ext_filter_mask = nla_get_u32(tb[i]); + break; + case IFLA_MASTER: + master_idx = nla_get_u32(tb[i]); + break; + case IFLA_LINKINFO: + kind_ops = linkinfo_to_kind_ops(tb[i]); + break; + default: + if (cb->strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); + return -EINVAL; + } + } + } + + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; + +walk_entries: + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &tgt_net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + goto cont; + if (idx < s_idx) + goto cont; + err = rtnl_fill_ifinfo(skb, dev, net, + RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, 0, flags, + ext_filter_mask, 0, NULL, 0, + netnsid, GFP_KERNEL); + + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } +cont: + idx++; + } + } +out: + err = skb->len; +out_err: + cb->args[1] = idx; + cb->args[0] = h; + cb->seq = tgt_net->dev_base_seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + if (netnsid >= 0) + put_net(tgt_net); + + return err; +} + +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr) +{ + const struct ifinfomsg *ifmp; + const struct nlattr *attrs; + size_t len; + + ifmp = nla_data(nla_peer); + attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); + len = nla_len(nla_peer) - sizeof(struct ifinfomsg); + + if (ifmp->ifi_index < 0) { + NL_SET_ERR_MSG_ATTR(exterr, nla_peer, + "ifindex can't be negative"); + return -EINVAL; + } + + return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, + exterr); +} +EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net; + /* Examine the link attributes and figure out which + * network namespace we are talking about. + */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else if (tb[IFLA_NET_NS_FD]) + net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); + else + net = get_net(src_net); + return net; +} +EXPORT_SYMBOL(rtnl_link_get_net); + +/* Figure out which network namespace we are talking about by + * examining the link attributes in the following order: + * + * 1. IFLA_NET_NS_PID + * 2. IFLA_NET_NS_FD + * 3. IFLA_TARGET_NETNSID + */ +static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, + struct nlattr *tb[]) +{ + struct net *net; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) + return rtnl_link_get_net(src_net, tb); + + if (!tb[IFLA_TARGET_NETNSID]) + return get_net(src_net); + + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); + if (!net) + return ERR_PTR(-EINVAL); + + return net; +} + +static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, + struct net *src_net, + struct nlattr *tb[], int cap) +{ + struct net *net; + + net = rtnl_link_get_net_by_nlattr(src_net, tb); + if (IS_ERR(net)) + return net; + + if (!netlink_ns_capable(skb, net->user_ns, cap)) { + put_net(net); + return ERR_PTR(-EPERM); + } + + return net; +} + +/* Verify that rtnetlink requests do not pass additional properties + * potentially referring to different network namespaces. + */ +static int rtnl_ensure_unique_netns(struct nlattr *tb[], + struct netlink_ext_ack *extack, + bool netns_id_only) +{ + + if (netns_id_only) { + if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) + return 0; + + NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); + return -EOPNOTSUPP; + } + + if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) + goto invalid_attr; + + if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) + goto invalid_attr; + + if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) + goto invalid_attr; + + return 0; + +invalid_attr: + NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); + return -EINVAL; +} + +static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_set_vf_rate) + return -EOPNOTSUPP; + if (max_tx_rate && max_tx_rate < min_tx_rate) + return -EINVAL; + + return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); +} + +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (dev) { + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem, err; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + af_ops = rtnl_af_lookup(nla_type(af)); + if (!af_ops) + return -EAFNOSUPPORT; + + if (!af_ops->set_link_af) + return -EOPNOTSUPP; + + if (af_ops->validate_link_af) { + err = af_ops->validate_link_af(dev, af, extack); + if (err < 0) + return err; + } + } + } + + return 0; +} + +static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, + int guid_type) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); +} + +static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) +{ + if (dev->type != ARPHRD_INFINIBAND) + return -EOPNOTSUPP; + + return handle_infiniband_guid(dev, ivt, guid_type); +} + +static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err = -EINVAL; + + if (tb[IFLA_VF_MAC]) { + struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); + + if (ivm->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN]) { + struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); + + if (ivv->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + if (len == 0) + return -EINVAL; + + if (ivvl[0]->vf >= INT_MAX) + return -EINVAL; + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_TX_RATE]) { + struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); + struct ifla_vf_info ivf; + + if (ivt->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_get_vf_config) + err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); + if (err < 0) + return err; + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, ivt->rate); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_RATE]) { + struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, ivt->max_tx_rate); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_SPOOFCHK]) { + struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); + + if (ivs->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_spoofchk) + err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, + ivs->setting); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_LINK_STATE]) { + struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); + + if (ivl->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_link_state) + err = ops->ndo_set_vf_link_state(dev, ivl->vf, + ivl->link_state); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_RSS_QUERY_EN]) { + struct ifla_vf_rss_query_en *ivrssq_en; + + err = -EOPNOTSUPP; + ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); + if (ivrssq_en->vf >= INT_MAX) + return -EINVAL; + if (ops->ndo_set_vf_rss_query_en) + err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, + ivrssq_en->setting); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_TRUST]) { + struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_trust) + err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_IB_NODE_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); + } + + if (tb[IFLA_VF_IB_PORT_GUID]) { + struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + + if (ivt->vf >= INT_MAX) + return -EINVAL; + if (!ops->ndo_set_vf_guid) + return -EOPNOTSUPP; + + return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); + } + + return err; +} + +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) +{ + struct net_device *upper_dev = netdev_master_upper_dev_get(dev); + const struct net_device_ops *ops; + int err; + + if (upper_dev) { + if (upper_dev->ifindex == ifindex) + return 0; + ops = upper_dev->netdev_ops; + if (ops->ndo_del_slave) { + err = ops->ndo_del_slave(upper_dev, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + + if (ifindex) { + upper_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!upper_dev) + return -EINVAL; + ops = upper_dev->netdev_ops; + if (ops->ndo_add_slave) { + err = ops->ndo_add_slave(upper_dev, dev, extack); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + return 0; +} + +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Don't turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + +#define DO_SETLINK_MODIFIED 0x01 +/* notify flag means notify + modified. */ +#define DO_SETLINK_NOTIFY 0x03 +static int do_setlink(const struct sk_buff *skb, + struct net_device *dev, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb, int status) +{ + const struct net_device_ops *ops = dev->netdev_ops; + char ifname[IFNAMSIZ]; + int err; + + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + const char *pat = ifname[0] ? ifname : NULL; + struct net *net; + int new_ifindex; + + net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); + if (IS_ERR(net)) { + err = PTR_ERR(net); + goto errout; + } + + if (tb[IFLA_NEW_IFINDEX]) + new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); + else + new_ifindex = 0; + + err = __dev_change_net_namespace(dev, net, pat, new_ifindex); + put_net(net); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_MAP]) { + struct rtnl_link_ifmap *u_map; + struct ifmap k_map; + + if (!ops->ndo_set_config) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + u_map = nla_data(tb[IFLA_MAP]); + k_map.mem_start = (unsigned long) u_map->mem_start; + k_map.mem_end = (unsigned long) u_map->mem_end; + k_map.base_addr = (unsigned short) u_map->base_addr; + k_map.irq = (unsigned char) u_map->irq; + k_map.dma = (unsigned char) u_map->dma; + k_map.port = (unsigned char) u_map->port; + + err = ops->ndo_set_config(dev, &k_map); + if (err < 0) + goto errout; + + status |= DO_SETLINK_NOTIFY; + } + + if (tb[IFLA_ADDRESS]) { + struct sockaddr *sa; + int len; + + len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, + sizeof(*sa)); + sa = kmalloc(len, GFP_KERNEL); + if (!sa) { + err = -ENOMEM; + goto errout; + } + sa->sa_family = dev->type; + memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), + dev->addr_len); + err = dev_set_mac_address_user(dev, sa, extack); + kfree(sa); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_MTU]) { + err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); + if (err < 0) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_GROUP]) { + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + status |= DO_SETLINK_NOTIFY; + } + + /* + * Interface selected by interface index but interface + * name provided implies that a name change has been + * requested. + */ + if (ifm->ifi_index > 0 && ifname[0]) { + err = dev_change_name(dev, ifname); + if (err < 0) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_IFALIAS]) { + err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); + if (err < 0) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + + if (tb[IFLA_BROADCAST]) { + nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + } + + if (ifm->ifi_flags || ifm->ifi_change) { + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); + if (err < 0) + goto errout; + } + + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_CARRIER]) { + err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_TXQLEN]) { + unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); + + err = dev_change_tx_queue_len(dev, value); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (max_size > dev->tso_max_size) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_segs ^ max_segs) { + netif_set_gso_max_segs(dev, max_segs); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (dev->gro_max_size ^ gro_max_size) { + netif_set_gro_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + + if (tb[IFLA_LINKMODE]) { + unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); + + write_lock(&dev_base_lock); + if (dev->link_mode ^ value) + status |= DO_SETLINK_NOTIFY; + dev->link_mode = value; + write_unlock(&dev_base_lock); + } + + if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *vfinfo[IFLA_VF_MAX + 1]; + struct nlattr *attr; + int rem; + + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { + if (nla_type(attr) != IFLA_VF_INFO || + nla_len(attr) < NLA_HDRLEN) { + err = -EINVAL; + goto errout; + } + err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, + attr, + ifla_vf_policy, + NULL); + if (err < 0) + goto errout; + err = do_setvfinfo(dev, vfinfo); + if (err < 0) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + err = 0; + + if (tb[IFLA_VF_PORTS]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + struct nlattr *attr; + int vf; + int rem; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_port) + goto errout; + + nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { + if (nla_type(attr) != IFLA_VF_PORT || + nla_len(attr) < NLA_HDRLEN) { + err = -EINVAL; + goto errout; + } + err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, + attr, + ifla_port_policy, + NULL); + if (err < 0) + goto errout; + if (!port[IFLA_PORT_VF]) { + err = -EOPNOTSUPP; + goto errout; + } + vf = nla_get_u32(port[IFLA_PORT_VF]); + err = ops->ndo_set_vf_port(dev, vf, port); + if (err < 0) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + err = 0; + + if (tb[IFLA_PORT_SELF]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + + err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], + ifla_port_policy, NULL); + if (err < 0) + goto errout; + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_port) + err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); + if (err < 0) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + + err = af_ops->set_link_af(dev, af, extack); + if (err < 0) + goto errout; + + status |= DO_SETLINK_NOTIFY; + } + } + err = 0; + + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + + if (tb[IFLA_XDP]) { + struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; + + err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, + tb[IFLA_XDP], + ifla_xdp_policy, NULL); + if (err < 0) + goto errout; + + if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { + err = -EINVAL; + goto errout; + } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { + err = -EINVAL; + goto errout; + } + } + + if (xdp[IFLA_XDP_FD]) { + int expected_fd = -1; + + if (xdp_flags & XDP_FLAGS_REPLACE) { + if (!xdp[IFLA_XDP_EXPECTED_FD]) { + err = -EINVAL; + goto errout; + } + expected_fd = + nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); + } + + err = dev_change_xdp_fd(dev, extack, + nla_get_s32(xdp[IFLA_XDP_FD]), + expected_fd, + xdp_flags); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + } + +errout: + if (status & DO_SETLINK_MODIFIED) { + if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) + netdev_state_change(dev); + + if (err < 0) + net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", + dev->name); + } + + return err; +} + +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *tb[]) +{ + char ifname[ALTIFNAMSIZ]; + + if (tb[IFLA_IFNAME]) + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else if (tb[IFLA_ALT_IFNAME]) + nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); + else + return NULL; + + return __dev_get_by_name(net, ifname); +} + +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + int err; + struct nlattr *tb[IFLA_MAX+1]; + + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + if (err < 0) + goto errout; + + err = rtnl_ensure_unique_netns(tb, extack, false); + if (err < 0) + goto errout; + + err = -EINVAL; + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb); + else + goto errout; + + if (dev == NULL) { + err = -ENODEV; + goto errout; + } + + err = do_setlink(skb, dev, ifm, extack, tb, 0); +errout: + return err; +} + +static int rtnl_group_dellink(const struct net *net, int group) +{ + struct net_device *dev, *aux; + LIST_HEAD(list_kill); + bool found = false; + + if (!group) + return -EPERM; + + for_each_netdev(net, dev) { + if (dev->group == group) { + const struct rtnl_link_ops *ops; + + found = true; + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + } + } + + if (!found) + return -ENODEV; + + for_each_netdev_safe(net, dev, aux) { + if (dev->group == group) { + const struct rtnl_link_ops *ops; + + ops = dev->rtnl_link_ops; + ops->dellink(dev, &list_kill); + } + } + unregister_netdevice_many(&list_kill); + + return 0; +} + +int rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; + struct net_device *dev = NULL; + struct ifinfomsg *ifm; + struct nlattr *tb[IFLA_MAX+1]; + int err; + int netnsid = -1; + + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + if (err < 0) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err < 0) + return err; + + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + + err = -EINVAL; + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb); + else if (tb[IFLA_GROUP]) + err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); + else + goto out; + + if (!dev) { + if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) + err = -ENODEV; + + goto out; + } + + err = rtnl_delete_link(dev); + +out: + if (netnsid >= 0) + put_net(tgt_net); + + return err; +} + +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +{ + unsigned int old_flags; + int err; + + old_flags = dev->flags; + if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); + if (err < 0) + return err; + } + + if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); + } else { + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + __dev_notify_flags(dev, old_flags, ~0U); + } + return 0; +} +EXPORT_SYMBOL(rtnl_configure_link); + +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + struct net_device *dev; + unsigned int num_tx_queues = 1; + unsigned int num_rx_queues = 1; + int err; + + if (tb[IFLA_NUM_TX_QUEUES]) + num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); + else if (ops->get_num_tx_queues) + num_tx_queues = ops->get_num_tx_queues(); + + if (tb[IFLA_NUM_RX_QUEUES]) + num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); + else if (ops->get_num_rx_queues) + num_rx_queues = ops->get_num_rx_queues(); + + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); + return ERR_PTR(-EINVAL); + } + + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); + return ERR_PTR(-EINVAL); + } + + if (ops->alloc) { + dev = ops->alloc(tb, ifname, name_assign_type, + num_tx_queues, num_rx_queues); + if (IS_ERR(dev)) + return dev; + } else { + dev = alloc_netdev_mqs(ops->priv_size, ifname, + name_assign_type, ops->setup, + num_tx_queues, num_rx_queues); + } + + if (!dev) + return ERR_PTR(-ENOMEM); + + err = validate_linkmsg(dev, tb, extack); + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } + + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; + dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + + if (tb[IFLA_MTU]) { + u32 mtu = nla_get_u32(tb[IFLA_MTU]); + + err = dev_validate_mtu(dev, mtu, extack); + if (err) { + free_netdev(dev); + return ERR_PTR(err); + } + dev->mtu = mtu; + } + if (tb[IFLA_ADDRESS]) { + __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); + dev->addr_assign_type = NET_ADDR_SET; + } + if (tb[IFLA_BROADCAST]) + memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), + nla_len(tb[IFLA_BROADCAST])); + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + if (tb[IFLA_LINKMODE]) + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (tb[IFLA_GROUP]) + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); + if (tb[IFLA_GRO_MAX_SIZE]) + netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + + return dev; +} +EXPORT_SYMBOL(rtnl_create_link); + +static int rtnl_group_changelink(const struct sk_buff *skb, + struct net *net, int group, + struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) +{ + struct net_device *dev, *aux; + int err; + + for_each_netdev_safe(net, dev, aux) { + if (dev->group == group) { + err = do_setlink(skb, dev, ifm, extack, tb, 0); + if (err < 0) + return err; + } + } + + return 0; +} + +static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, + const struct rtnl_link_ops *ops, + struct nlattr **tb, struct nlattr **data, + struct netlink_ext_ack *extack) +{ + unsigned char name_assign_type = NET_NAME_USER; + struct net *net = sock_net(skb->sk); + struct net *dest_net, *link_net; + struct net_device *dev; + char ifname[IFNAMSIZ]; + int err; + + if (!ops->alloc && !ops->setup) + return -EOPNOTSUPP; + + if (tb[IFLA_IFNAME]) { + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + } else { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } + + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; + goto out; + } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } + + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } + + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + else + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; + } + + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); + if (err < 0) + goto out_unregister; + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } +out: + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; +out_unregister: + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; +} + +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtnl_newlink_tbs *tbs, + struct netlink_ext_ack *extack) +{ + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr ** const tb = tbs->tb; + const struct rtnl_link_ops *m_ops; + struct net_device *master_dev; + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; + struct net_device *dev; + struct ifinfomsg *ifm; + struct nlattr **data; + bool link_specified; + int err; + +#ifdef CONFIG_MODULES +replay: +#endif + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + if (err < 0) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, false); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) { + link_specified = true; + dev = __dev_get_by_index(net, ifm->ifi_index); + } else if (ifm->ifi_index < 0) { + NL_SET_ERR_MSG(extack, "ifindex can't be negative"); + return -EINVAL; + } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { + link_specified = true; + dev = rtnl_dev_get(net, tb); + } else { + link_specified = false; + dev = NULL; + } + + master_dev = NULL; + m_ops = NULL; + if (dev) { + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + } + + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + return err; + + if (tb[IFLA_LINKINFO]) { + err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], + ifla_info_policy, NULL); + if (err < 0) + return err; + } else + memset(linkinfo, 0, sizeof(linkinfo)); + + if (linkinfo[IFLA_INFO_KIND]) { + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } else { + kind[0] = '\0'; + ops = NULL; + } + + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; + + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = tbs->attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; + } + } + + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested_deprecated(tbs->slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, + extack); + if (err < 0) + return err; + slave_data = tbs->slave_attr; + } + } + + if (dev) { + int status = 0; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } + + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } + + return do_setlink(skb, dev, ifm, extack, tb, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, + * or it's for a group + */ + if (link_specified) + return -ENODEV; + if (tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); + return -ENODEV; + } + + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { +#ifdef CONFIG_MODULES + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; + } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } + + return rtnl_newlink_create(skb, ifm, ops, tb, data, extack); +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtnl_newlink_tbs *tbs; + int ret; + + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, tbs, extack); + kfree(tbs); + return ret; +} + +static int rtnl_valid_getlink_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for get link"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + if (err) + return err; + + for (i = 0; i <= IFLA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_IFNAME: + case IFLA_ALT_IFNAME: + case IFLA_EXT_MASK: + case IFLA_TARGET_NETNSID: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct net *tgt_net = net; + struct ifinfomsg *ifm; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct sk_buff *nskb; + int netnsid = -1; + int err; + u32 ext_filter_mask = 0; + + err = rtnl_valid_getlink_req(skb, nlh, tb, extack); + if (err < 0) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err < 0) + return err; + + if (tb[IFLA_TARGET_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + err = -EINVAL; + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb); + else + goto out; + + err = -ENODEV; + if (dev == NULL) + goto out; + + err = -ENOBUFS; + nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + if (nskb == NULL) + goto out; + + err = rtnl_fill_ifinfo(nskb, dev, net, + RTM_NEWLINK, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, 0, 0, ext_filter_mask, + 0, NULL, 0, netnsid, GFP_KERNEL); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); +out: + if (netnsid >= 0) + put_net(tgt_net); + + return err; +} + +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + size_t size; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + if (cmd == RTM_NEWLINKPROP) { + size = rtnl_prop_list_size(dev); + size += nla_total_size(ALTIFNAMSIZ); + if (size >= U16_MAX) { + NL_SET_ERR_MSG(extack, + "effective property list too long"); + return -EINVAL; + } + } + + alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); + if (!alt_ifname) + return -ENOMEM; + + if (cmd == RTM_NEWLINKPROP) { + err = netdev_name_node_alt_create(dev, alt_ifname); + if (!err) + alt_ifname = NULL; + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + } else { + WARN_ON_ONCE(1); + err = -EINVAL; + } + + kfree(alt_ifname); + if (!err) + *changed = true; + return err; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + +static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + size_t min_ifinfo_dump_size = 0; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + struct net_device *dev; + int hdrlen; + + /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + + if (!ext_filter_mask) + return NLMSG_GOODSIZE; + /* + * traverse the list of net devices and compute the minimum + * buffer size based upon the filter mask. + */ + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + min_ifinfo_dump_size = max(min_ifinfo_dump_size, + if_nlmsg_size(dev, ext_filter_mask)); + } + rcu_read_unlock(); + + return nlmsg_total_size(min_ifinfo_dump_size); +} + +static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + int type = cb->nlh->nlmsg_type - RTM_BASE; + int ret = 0; + + if (s_idx == 0) + s_idx = 1; + + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + struct rtnl_link __rcu **tab; + struct rtnl_link *link; + rtnl_dumpit_func dumpit; + + if (idx < s_idx || idx == PF_PACKET) + continue; + + if (type < 0 || type >= RTM_NR_MSGTYPES) + continue; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = rcu_dereference_rtnl(tab[type]); + if (!link) + continue; + + dumpit = link->dumpit; + if (!dumpit) + continue; + + if (idx > s_idx) { + memset(&cb->args[0], 0, sizeof(cb->args)); + cb->prev_seq = 0; + cb->seq = 0; + } + ret = dumpit(skb, cb); + if (ret) + break; + } + cb->family = idx; + + return skb->len ? : ret; +} + +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, + u32 event, gfp_t flags, int *new_nsid, + int new_ifindex) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); + if (skb == NULL) + goto errout; + + err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), + type, 0, 0, change, 0, 0, event, + new_nsid, new_ifindex, -1, flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + return skb; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); +} + +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, u32 event, + gfp_t flags, int *new_nsid, int new_ifindex) +{ + struct sk_buff *skb; + + if (dev->reg_state != NETREG_REGISTERED) + return; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, + new_ifindex); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + NULL, 0); +} + +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid, int new_ifindex) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid, new_ifindex); +} + +static int nlmsg_populate_fdb_fill(struct sk_buff *skb, + struct net_device *dev, + u8 *addr, u16 vid, u32 pid, u32 seq, + int type, unsigned int flags, + int nlflags, u16 ndm_state) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = flags; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dev->ifindex; + ndm->ndm_state = ndm_state; + + if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) + goto nla_put_failure; + if (vid) + if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ + nla_total_size(sizeof(u16)) + /* NDA_VLAN */ + 0; +} + +static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, + u16 ndm_state) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); + if (!skb) + goto errout; + + err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, + 0, 0, type, NTF_SELF, 0, ndm_state); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +/* + * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry + */ +int ndo_dflt_fdb_add(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, + u16 flags) +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + netdev_info(dev, "default FDB implementation only supports local addresses\n"); + return err; + } + + if (vid) { + netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_add); + +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, + struct netlink_ext_ack *extack) +{ + u16 vid = 0; + + if (vlan_attr) { + if (nla_len(vlan_attr) != sizeof(u16)) { + NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); + return -EINVAL; + } + + vid = nla_get_u16(vlan_attr); + + if (!vid || vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG(extack, "invalid vlan id"); + return -EINVAL; + } + } + *p_vid = vid; + return 0; +} + +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct net_device *dev; + u8 *addr; + u16 vid; + int err; + + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, + extack); + if (err < 0) + return err; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex == 0) { + NL_SET_ERR_MSG(extack, "invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + NL_SET_ERR_MSG(extack, "unknown ifindex"); + return -ENODEV; + } + + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "invalid address"); + return -EINVAL; + } + + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); + return -EINVAL; + } + + addr = nla_data(tb[NDA_LLADDR]); + + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); + if (err) + return err; + + err = -EOPNOTSUPP; + + /* Support fdb on master device the net/bridge default case */ + if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && + netif_is_bridge_port(dev)) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + const struct net_device_ops *ops = br_dev->netdev_ops; + + err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, + nlh->nlmsg_flags, extack); + if (err) + goto out; + else + ndm->ndm_flags &= ~NTF_MASTER; + } + + /* Embedded bridge, macvlan, and any other device support */ + if ((ndm->ndm_flags & NTF_SELF)) { + if (dev->netdev_ops->ndo_fdb_add) + err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + vid, + nlh->nlmsg_flags, + extack); + else + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, + nlh->nlmsg_flags); + + if (!err) { + rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, + ndm->ndm_state); + ndm->ndm_flags &= ~NTF_SELF; + } + } +out: + return err; +} + +/* + * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry + */ +int ndo_dflt_fdb_del(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (!(ndm->ndm_state & NUD_PERMANENT)) { + netdev_info(dev, "default FDB implementation only supports local addresses\n"); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_del(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_del(dev, addr); + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_del); + +static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = { + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, + [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, +}; + +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); + struct net *net = sock_net(skb->sk); + const struct net_device_ops *ops; + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct net_device *dev; + __u8 *addr = NULL; + int err; + u16 vid; + + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (!del_bulk) { + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, + NULL, extack); + } else { + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, + fdb_del_bulk_policy, extack); + } + if (err < 0) + return err; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex == 0) { + NL_SET_ERR_MSG(extack, "invalid ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + NL_SET_ERR_MSG(extack, "unknown ifindex"); + return -ENODEV; + } + + if (!del_bulk) { + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "invalid address"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); + } + + if (dev->type != ARPHRD_ETHER) { + NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); + return -EINVAL; + } + + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); + if (err) + return err; + + err = -EOPNOTSUPP; + + /* Support fdb on master device the net/bridge default case */ + if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && + netif_is_bridge_port(dev)) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + ops = br_dev->netdev_ops; + if (!del_bulk) { + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + } else { + if (ops->ndo_fdb_del_bulk) + err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, + extack); + } + + if (err) + goto out; + else + ndm->ndm_flags &= ~NTF_MASTER; + } + + /* Embedded bridge, macvlan, and any other device support */ + if (ndm->ndm_flags & NTF_SELF) { + ops = dev->netdev_ops; + if (!del_bulk) { + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + else + err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); + } else { + /* in case err was cleared by NTF_MASTER call */ + err = -EOPNOTSUPP; + if (ops->ndo_fdb_del_bulk) + err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, + extack); + } + + if (!err) { + if (!del_bulk) + rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, + ndm->ndm_state); + ndm->ndm_flags &= ~NTF_SELF; + } + } +out: + return err; +} + +static int nlmsg_populate_fdb(struct sk_buff *skb, + struct netlink_callback *cb, + struct net_device *dev, + int *idx, + struct netdev_hw_addr_list *list) +{ + struct netdev_hw_addr *ha; + int err; + u32 portid, seq; + + portid = NETLINK_CB(cb->skb).portid; + seq = cb->nlh->nlmsg_seq; + + list_for_each_entry(ha, &list->list, list) { + if (*idx < cb->args[2]) + goto skip; + + err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, + portid, seq, + RTM_NEWNEIGH, NTF_SELF, + NLM_F_MULTI, NUD_PERMANENT); + if (err < 0) + return err; +skip: + *idx += 1; + } + return 0; +} + +/** + * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. + * @skb: socket buffer to store message in + * @cb: netlink callback + * @dev: netdevice + * @filter_dev: ignored + * @idx: the number of FDB table entries dumped is added to *@idx + * + * Default netdevice operation to dump the existing unicast address list. + * Returns number of addresses from list put in skb. + */ +int ndo_dflt_fdb_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct net_device *dev, + struct net_device *filter_dev, + int *idx) +{ + int err; + + if (dev->type != ARPHRD_ETHER) + return -EINVAL; + + netif_addr_lock_bh(dev); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); + if (err) + goto out; + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); +out: + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_dump); + +static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, NULL, extack); + if (err < 0) + return err; + + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); + return -EINVAL; + } + *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); + return -EINVAL; + } + *br_idx = nla_get_u32(tb[NDA_MASTER]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err; + + /* A hack to preserve kernel<->userspace interface. + * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. + * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. + * So, check for ndmsg with an optional u32 attribute (not used here). + * Fortunately these sizes don't conflict with the size of ifinfomsg + * with an optional attribute. + */ + if (nlmsg_len(nlh) != sizeof(struct ndmsg) && + (nlmsg_len(nlh) != sizeof(struct ndmsg) + + nla_attr_size(sizeof(u32)))) { + struct ifinfomsg *ifm; + + err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { + if (tb[IFLA_MASTER]) + *br_idx = nla_get_u32(tb[IFLA_MASTER]); + } + + ifm = nlmsg_data(nlh); + *brport_idx = ifm->ifi_index; + } + return 0; +} + +static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net_device *dev; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + int brport_idx = 0; + int br_idx = 0; + int h, s_h; + int idx = 0, s_idx; + int err = 0; + int fidx = 0; + + if (cb->strict_check) + err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, + cb->extack); + else + err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, + cb->extack); + if (err < 0) + return err; + + if (br_idx) { + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) + return -ENODEV; + + ops = br_dev->netdev_ops; + } + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { + + if (brport_idx && (dev->ifindex != brport_idx)) + continue; + + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; + + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } + + if (idx < s_idx) + goto cont; + + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, + br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + goto out; + } + } + + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, + dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, + &fidx); + if (err == -EMSGSIZE) + goto out; + + cops = NULL; + + /* reset fdb offset to 0 for rest of the interfaces */ + cb->args[2] = 0; + fidx = 0; +cont: + idx++; + } + } + +out: + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = fidx; + + return skb->len; +} + +static int valid_fdb_get_strict(const struct nlmsghdr *nlh, + struct nlattr **tb, u8 *ndm_flags, + int *br_idx, int *brport_idx, u8 **addr, + u16 *vid, struct netlink_ext_ack *extack) +{ + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); + return -EINVAL; + } + + if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); + if (err < 0) + return err; + + *ndm_flags = ndm->ndm_flags; + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_MASTER: + *br_idx = nla_get_u32(tb[i]); + break; + case NDA_LLADDR: + if (nla_len(tb[i]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); + return -EINVAL; + } + *addr = nla_data(tb[i]); + break; + case NDA_VLAN: + err = fdb_vid_parse(tb[i], vid, extack); + if (err) + return err; + break; + case NDA_VNI: + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); + return -EINVAL; + } + } + + return 0; +} + +static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL, *br_dev = NULL; + const struct net_device_ops *ops = NULL; + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct sk_buff *skb; + int brport_idx = 0; + u8 ndm_flags = 0; + int br_idx = 0; + u8 *addr = NULL; + u16 vid = 0; + int err; + + err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, + &brport_idx, &addr, &vid, extack); + if (err < 0) + return err; + + if (!addr) { + NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); + return -EINVAL; + } + + if (brport_idx) { + dev = __dev_get_by_index(net, brport_idx); + if (!dev) { + NL_SET_ERR_MSG(extack, "Unknown device ifindex"); + return -ENODEV; + } + } + + if (br_idx) { + if (dev) { + NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); + return -EINVAL; + } + + br_dev = __dev_get_by_index(net, br_idx); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Invalid master ifindex"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } + + if (dev) { + if (!ndm_flags || (ndm_flags & NTF_MASTER)) { + if (!netif_is_bridge_port(dev)) { + NL_SET_ERR_MSG(extack, "Device is not a bridge port"); + return -EINVAL; + } + br_dev = netdev_master_upper_dev_get(dev); + if (!br_dev) { + NL_SET_ERR_MSG(extack, "Master of device not found"); + return -EINVAL; + } + ops = br_dev->netdev_ops; + } else { + if (!(ndm_flags & NTF_SELF)) { + NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); + return -EINVAL; + } + ops = dev->netdev_ops; + } + } + + if (!br_dev && !dev) { + NL_SET_ERR_MSG(extack, "No device specified"); + return -ENODEV; + } + + if (!ops || !ops->ndo_fdb_get) { + NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); + return -EOPNOTSUPP; + } + + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (br_dev) + dev = br_dev; + err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, extack); + if (err) + goto out; + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); +out: + kfree_skb(skb); + return err; +} + +static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, + unsigned int attrnum, unsigned int flag) +{ + if (mask & flag) + return nla_put_u8(skb, attrnum, !!(flags & flag)); + return 0; +} + +int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u16 mode, + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)) +{ + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + struct nlattr *br_afspec; + struct nlattr *protinfo; + u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + int err = 0; + + nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifi_family = AF_BRIDGE; + ifm->__ifi_pad = 0; + ifm->ifi_type = dev->type; + ifm->ifi_index = dev->ifindex; + ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_change = 0; + + + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || + nla_put_u32(skb, IFLA_MTU, dev->mtu) || + nla_put_u8(skb, IFLA_OPERSTATE, operstate) || + (br_dev && + nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || + (dev->addr_len && + nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || + (dev->ifindex != dev_get_iflink(dev) && + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) + goto nla_put_failure; + + br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!br_afspec) + goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + + if (mode != BRIDGE_MODE_UNDEF) { + if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } + if (vlan_fill) { + err = vlan_fill(skb, dev, filter_mask); + if (err) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } + nla_nest_end(skb, br_afspec); + + protinfo = nla_nest_start(skb, IFLA_PROTINFO); + if (!protinfo) + goto nla_put_failure; + + if (brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_FAST_LEAVE, + BR_MULTICAST_FAST_LEAVE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING, BR_LEARNING) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROXYARP, BR_PROXYARP) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { + nla_nest_cancel(skb, protinfo); + goto nla_put_failure; + } + + nla_nest_end(skb, protinfo); + + nlmsg_end(skb, nlh); + return 0; +nla_put_failure: + nlmsg_cancel(skb, nlh); + return err ? err : -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); + +static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, + bool strict_check, u32 *filter_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err, i; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, + sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); + } else { + err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); + } + if (err < 0) + return err; + + /* new attributes should only be added with strict checking */ + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_EXT_MASK: + *filter_mask = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); + return -EINVAL; + } + } + } + + return 0; +} + +static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx = 0; + u32 portid = NETLINK_CB(cb->skb).portid; + u32 seq = nlh->nlmsg_seq; + u32 filter_mask = 0; + int err; + + err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, + cb->extack); + if (err < 0 && cb->strict_check) + return err; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + const struct net_device_ops *ops = dev->netdev_ops; + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } + } + idx++; + } + + if (ops->ndo_bridge_getlink) { + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } + } + idx++; + } + } + err = skb->len; +out_err: + rcu_read_unlock(); + cb->args[0] = idx; + + return err; +} + +static inline size_t bridge_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ +} + +static int rtnl_bridge_notify(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -EOPNOTSUPP; + + if (!dev->netdev_ops->ndo_bridge_getlink) + return 0; + + skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); + if (!skb) { + err = -ENOMEM; + goto errout; + } + + err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); + if (err < 0) + goto errout; + + /* Notification info is only filled for bridge ports, not the bridge + * device itself. Therefore, a zero notification length is valid and + * should not result in an error. + */ + if (!skb->len) + goto errout; + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return 0; +errout: + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + if (err) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return err; +} + +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + struct nlattr *br_spec, *attr = NULL; + int rem, err = -EOPNOTSUPP; + u16 flags = 0; + bool have_flags = false; + + if (nlmsg_len(nlh) < sizeof(*ifm)) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + dev = __dev_get_by_index(net, ifm->ifi_index); + if (!dev) { + NL_SET_ERR_MSG(extack, "unknown ifindex"); + return -ENODEV; + } + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (br_spec) { + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + + have_flags = true; + flags = nla_get_u16(attr); + } + + if (nla_type(attr) == IFLA_BRIDGE_MODE) { + if (nla_len(attr) < sizeof(u16)) + return -EINVAL; + } + } + } + + if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { + err = -EOPNOTSUPP; + goto out; + } + + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, + extack); + if (err) + goto out; + + flags &= ~BRIDGE_FLAGS_MASTER; + } + + if ((flags & BRIDGE_FLAGS_SELF)) { + if (!dev->netdev_ops->ndo_bridge_setlink) + err = -EOPNOTSUPP; + else + err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, + flags, + extack); + if (!err) { + flags &= ~BRIDGE_FLAGS_SELF; + + /* Generate event to notify upper layer of bridge + * change + */ + err = rtnl_bridge_notify(dev); + } + } + + if (have_flags) + memcpy(nla_data(attr), &flags, sizeof(flags)); +out: + return err; +} + +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + struct nlattr *br_spec, *attr = NULL; + int rem, err = -EOPNOTSUPP; + u16 flags = 0; + bool have_flags = false; + + if (nlmsg_len(nlh) < sizeof(*ifm)) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + dev = __dev_get_by_index(net, ifm->ifi_index); + if (!dev) { + NL_SET_ERR_MSG(extack, "unknown ifindex"); + return -ENODEV; + } + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (br_spec) { + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + + have_flags = true; + flags = nla_get_u16(attr); + break; + } + } + } + + if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { + struct net_device *br_dev = netdev_master_upper_dev_get(dev); + + if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { + err = -EOPNOTSUPP; + goto out; + } + + err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); + if (err) + goto out; + + flags &= ~BRIDGE_FLAGS_MASTER; + } + + if ((flags & BRIDGE_FLAGS_SELF)) { + if (!dev->netdev_ops->ndo_bridge_dellink) + err = -EOPNOTSUPP; + else + err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, + flags); + + if (!err) { + flags &= ~BRIDGE_FLAGS_SELF; + + /* Generate event to notify upper layer of bridge + * change + */ + err = rtnl_bridge_notify(dev); + } + } + + if (have_flags) + memcpy(nla_data(attr), &flags, sizeof(flags)); +out: + return err; +} + +static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) +{ + return (mask & IFLA_STATS_FILTER_BIT(attrid)) && + (!idxattr || idxattr == attrid); +} + +static bool +rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) +{ + return dev->netdev_ops && + dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats && + dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); +} + +static unsigned int +rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) +{ + return rtnl_offload_xstats_have_ndo(dev, attr_id) ? + sizeof(struct rtnl_link_stats64) : 0; +} + +static int +rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, + struct sk_buff *skb) +{ + unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); + struct nlattr *attr = NULL; + void *attr_data; + int err; + + if (!size) + return -ENODATA; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); + if (err) + return err; + + return 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_stats(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return enabled ? sizeof(struct rtnl_hw_stats64) : 0; +} + +struct rtnl_offload_xstats_request_used { + bool request; + bool used; +}; + +static int +rtnl_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_offload_xstats_request_used *ru, + struct rtnl_hw_stats64 *stats, + struct netlink_ext_ack *extack) +{ + bool request; + bool used; + int err; + + request = netdev_offload_xstats_enabled(dev, type); + if (!request) { + used = false; + goto out; + } + + err = netdev_offload_xstats_get(dev, type, stats, &used, extack); + if (err) + return err; + +out: + if (ru) { + ru->request = request; + ru->used = used; + } + return 0; +} + +static int +rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, + struct rtnl_offload_xstats_request_used *ru) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr_id); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int +rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_offload_xstats_request_used ru_l3; + struct nlattr *nest; + int err; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); + if (err) + return err; + + nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); + if (!nest) + return -EMSGSIZE; + + if (rtnl_offload_xstats_fill_hw_s_info_one(skb, + IFLA_OFFLOAD_XSTATS_L3_STATS, + &ru_l3)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, + int *prividx, u32 off_filter_mask, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; + int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; + bool have_data = false; + int err; + + if (*prividx <= attr_id_cpu_hit && + (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { + err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); + if (!err) { + have_data = true; + } else if (err != -ENODATA) { + *prividx = attr_id_cpu_hit; + return err; + } + } + + if (*prividx <= attr_id_hw_s_info && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { + *prividx = attr_id_hw_s_info; + + err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); + if (err) + return err; + + have_data = true; + *prividx = 0; + } + + if (*prividx <= attr_id_l3_stats && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { + unsigned int size_l3; + struct nlattr *attr; + + *prividx = attr_id_l3_stats; + + size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); + if (!size_l3) + goto skip_l3_stats; + attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, + nla_data(attr), extack); + if (err) + return err; + + have_data = true; +skip_l3_stats: + *prividx = 0; + } + + if (!have_data) + return -ENODATA; + + *prividx = 0; + return 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ + nla_total_size(sizeof(u8)) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ + nla_total_size(sizeof(u8)) + + 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ + rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + + 0; +} + +static int rtnl_offload_xstats_get_size(const struct net_device *dev, + u32 off_filter_mask) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; + int nla_size = 0; + int size; + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { + size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); + nla_size += nla_total_size_64bit(size); + } + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) + nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { + size = rtnl_offload_xstats_get_size_stats(dev, t_l3); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + +struct rtnl_stats_dump_filters { + /* mask[0] filters outer attributes. Then individual nests have their + * filtering mask at the index of the nested attribute. + */ + u32 mask[IFLA_STATS_MAX + 1]; +}; + +static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags, + const struct rtnl_stats_dump_filters *filters, + int *idxattr, int *prividx, + struct netlink_ext_ack *extack) +{ + unsigned int filter_mask = filters->mask[0]; + struct if_stats_msg *ifsm; + struct nlmsghdr *nlh; + struct nlattr *attr; + int s_prividx = *prividx; + int err; + + ASSERT_RTNL(); + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); + if (!nlh) + return -EMSGSIZE; + + ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; + ifsm->ifindex = dev->ifindex; + ifsm->filter_mask = filter_mask; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { + struct rtnl_link_stats64 *sp; + + attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, + sizeof(struct rtnl_link_stats64), + IFLA_STATS_UNSPEC); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + sp = nla_data(attr); + dev_get_stats(dev, sp); + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->fill_linkxstats) { + *idxattr = IFLA_STATS_LINK_XSTATS; + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_XSTATS); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, + *idxattr)) { + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + master = netdev_master_upper_dev_get(dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->fill_linkxstats) { + *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_XSTATS_SLAVE); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = rtnl_offload_xstats_fill(skb, dev, prividx, + off_filter_mask, extack); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { + struct rtnl_af_ops *af_ops; + + *idxattr = IFLA_STATS_AF_SPEC; + attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); + if (!attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_stats_af) { + struct nlattr *af; + + af = nla_nest_start_noflag(skb, + af_ops->family); + if (!af) { + rcu_read_unlock(); + err = -EMSGSIZE; + goto nla_put_failure; + } + err = af_ops->fill_stats_af(skb, dev); + + if (err == -ENODATA) { + nla_nest_cancel(skb, af); + } else if (err < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } + + nla_nest_end(skb, af); + } + } + rcu_read_unlock(); + + nla_nest_end(skb, attr); + + *idxattr = 0; + } + + nlmsg_end(skb, nlh); + + return 0; + +nla_put_failure: + /* not a multi message or no progress mean a real error */ + if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) + nlmsg_cancel(skb, nlh); + else + nlmsg_end(skb, nlh); + + return err; +} + +static size_t if_nlmsg_stats_size(const struct net_device *dev, + const struct rtnl_stats_dump_filters *filters) +{ + size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); + unsigned int filter_mask = filters->mask[0]; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) + size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + int attr = IFLA_STATS_LINK_XSTATS; + + if (ops && ops->get_linkxstats_size) { + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS */ + size += nla_total_size(0); + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { + struct net_device *_dev = (struct net_device *)dev; + const struct rtnl_link_ops *ops = NULL; + const struct net_device *master; + + /* netdev_master_upper_dev_get can't take const */ + master = netdev_master_upper_dev_get(_dev); + if (master) + ops = master->rtnl_link_ops; + if (ops && ops->get_linkxstats_size) { + int attr = IFLA_STATS_LINK_XSTATS_SLAVE; + + size += nla_total_size(ops->get_linkxstats_size(dev, + attr)); + /* for IFLA_STATS_LINK_XSTATS_SLAVE */ + size += nla_total_size(0); + } + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + size += rtnl_offload_xstats_get_size(dev, off_filter_mask); + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { + struct rtnl_af_ops *af_ops; + + /* for IFLA_STATS_AF_SPEC */ + size += nla_total_size(0); + + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_stats_af_size) { + size += nla_total_size( + af_ops->get_stats_af_size(dev)); + + /* for AF_* */ + size += nla_total_size(0); + } + } + rcu_read_unlock(); + } + + return size; +} + +#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) + +static const struct nla_policy +rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_OFFLOAD_XSTATS] = + NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), +}; + +static const struct nla_policy +rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_GET_FILTERS] = + NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), +}; + +static const struct nla_policy +ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_MAX + 1]; + int err; + int at; + + err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, + rtnl_stats_get_policy_filters, extack); + if (err < 0) + return err; + + for (at = 1; at <= IFLA_STATS_MAX; at++) { + if (tb[at]) { + if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { + NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); + return -EINVAL; + } + filters->mask[at] = nla_get_u32(tb[at]); + } + } + + return 0; +} + +static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, + u32 filter_mask, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + int err; + int i; + + filters->mask[0] = filter_mask; + for (i = 1; i < ARRAY_SIZE(filters->mask); i++) + filters->mask[i] = -1U; + + err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, + IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_GET_FILTERS]) { + err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], + filters, extack); + if (err) + return err; + } + + return 0; +} + +static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, + bool is_dump, struct netlink_ext_ack *extack) +{ + struct if_stats_msg *ifsm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); + return -EINVAL; + } + + if (!strict_check) + return 0; + + ifsm = nlmsg_data(nlh); + + /* only requests using strict checks can pass data to influence + * the dump. The legacy exception is filter_mask. + */ + if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { + NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); + return -EINVAL; + } + if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { + NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); + return -EINVAL; + } + + return 0; +} + +static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtnl_stats_dump_filters filters; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + int idxattr = 0, prividx = 0; + struct if_stats_msg *ifsm; + struct sk_buff *nskb; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (!ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); + return -EINVAL; + } + + err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); + if (err) + return err; + + nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); + if (!nskb) + return -ENOBUFS; + + err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, + NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + 0, &filters, &idxattr, &prividx, extack); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else { + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + } + + return err; +} + +static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + int h, s_h, err, s_idx, s_idxattr, s_prividx; + struct rtnl_stats_dump_filters filters; + struct net *net = sock_net(skb->sk); + unsigned int flags = NLM_F_MULTI; + struct if_stats_msg *ifsm; + struct hlist_head *head; + struct net_device *dev; + int idx = 0; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + s_idxattr = cb->args[2]; + s_prividx = cb->args[3]; + + cb->seq = net->dev_base_seq; + + err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); + if (err) + return err; + + ifsm = nlmsg_data(cb->nlh); + if (!ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); + return -EINVAL; + } + + err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, + extack); + if (err) + return err; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &s_idxattr, &s_prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err < 0) + goto out; + s_prividx = 0; + s_idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +out: + cb->args[3] = s_prividx; + cb->args[2] = s_idxattr; + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + +void rtnl_offload_xstats_notify(struct net_device *dev) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct net *net = dev_net(dev); + int idxattr = 0, prividx = 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + ASSERT_RTNL(); + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + + skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!skb) + goto errout; + + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, + &response_filters, &idxattr, &prividx, NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_STATS, err); +} +EXPORT_SYMBOL(rtnl_offload_xstats_notify); + +static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_stats_dump_filters response_filters = {}; + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + struct if_stats_msg *ifsm; + bool notify = false; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); + return -EINVAL; + } + + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); + return -EINVAL; + } + + err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, + ifla_stats_set_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { + u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); + + if (req) + err = netdev_offload_xstats_enable(dev, t_l3, extack); + else + err = netdev_offload_xstats_disable(dev, t_l3); + + if (!err) + notify = true; + else if (err != -EALREADY) + return err; + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + } + + if (notify) + rtnl_offload_xstats_notify(dev); + + return 0; +} + +/* Process one rtnetlink message. */ + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct rtnl_link *link; + enum rtnl_kinds kind; + struct module *owner; + int err = -EOPNOTSUPP; + rtnl_doit_func doit; + unsigned int flags; + int family; + int type; + + type = nlh->nlmsg_type; + if (type > RTM_MAX) + return -EOPNOTSUPP; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) + return 0; + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + kind = rtnl_msgtype_kind(type); + + if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + rcu_read_lock(); + if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { + struct sock *rtnl; + rtnl_dumpit_func dumpit; + u32 min_dump_alloc = 0; + + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { + family = PF_UNSPEC; + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) + goto err_unlock; + } + owner = link->owner; + dumpit = link->dumpit; + + if (type == RTM_GETLINK - RTM_BASE) + min_dump_alloc = rtnl_calcit(skb, nlh); + + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + + rcu_read_unlock(); + + rtnl = net->rtnl; + if (err == 0) { + struct netlink_dump_control c = { + .dump = dumpit, + .min_dump_alloc = min_dump_alloc, + .module = owner, + }; + err = netlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. + */ + module_put(owner); + } + return err; + } + + link = rtnl_get_link(family, type); + if (!link || !link->doit) { + family = PF_UNSPEC; + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; + } + + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; + } + + flags = link->flags; + if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && + !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { + NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); + module_put(owner); + goto err_unlock; + } + + if (flags & RTNL_FLAG_DOIT_UNLOCKED) { + doit = link->doit; + rcu_read_unlock(); + if (doit) + err = doit(skb, nlh, extack); + module_put(owner); + return err; + } + rcu_read_unlock(); + + rtnl_lock(); + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); + rtnl_unlock(); + + module_put(owner); + + return err; + +out_unlock: + rcu_read_unlock(); + return err; + +err_unlock: + rcu_read_unlock(); + return -EOPNOTSUPP; +} + +static void rtnetlink_rcv(struct sk_buff *skb) +{ + netlink_rcv_skb(skb, &rtnetlink_rcv_msg); +} + +static int rtnetlink_bind(struct net *net, int group) +{ + switch (group) { + case RTNLGRP_IPV4_MROUTE_R: + case RTNLGRP_IPV6_MROUTE_R: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + break; + } + return 0; +} + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REBOOT: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_CHANGEINFODATA: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_CHANGE_TX_QUEUE_LEN: + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), + GFP_KERNEL, NULL, 0); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { + .notifier_call = rtnetlink_event, +}; + + +static int __net_init rtnetlink_net_init(struct net *net) +{ + struct sock *sk; + struct netlink_kernel_cfg cfg = { + .groups = RTNLGRP_MAX, + .input = rtnetlink_rcv, + .cb_mutex = &rtnl_mutex, + .flags = NL_CFG_F_NONROOT_RECV, + .bind = rtnetlink_bind, + }; + + sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); + if (!sk) + return -ENOMEM; + net->rtnl = sk; + return 0; +} + +static void __net_exit rtnetlink_net_exit(struct net *net) +{ + netlink_kernel_release(net->rtnl); + net->rtnl = NULL; +} + +static struct pernet_operations rtnetlink_net_ops = { + .init = rtnetlink_net_init, + .exit = rtnetlink_net_exit, +}; + +void __init rtnetlink_init(void) +{ + if (register_pernet_subsys(&rtnetlink_net_ops)) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + + register_netdevice_notifier(&rtnetlink_dev_notifier); + + rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, + rtnl_dump_ifinfo, 0); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); + + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); + + rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); + + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, + RTNL_FLAG_BULK_DEL_SUPPORTED); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); + + rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); + rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); + + rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, + 0); + rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); +} diff --git a/net/core/scm.c b/net/core/scm.c new file mode 100644 index 000000000..e762a4b8a --- /dev/null +++ b/net/core/scm.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Alignment and value checking mods by Craig Metz + */ + +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/sched/user.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/security.h> +#include <linux/pid_namespace.h> +#include <linux/pid.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> +#include <linux/errqueue.h> +#include <linux/io_uring.h> + +#include <linux/uaccess.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/compat.h> +#include <net/scm.h> +#include <net/cls_cgroup.h> + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. + */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + const struct cred *cred = current_cred(); + kuid_t uid = make_kuid(cred->user_ns, creds->uid); + kgid_t gid = make_kgid(cred->user_ns, creds->gid); + + if (!uid_valid(uid) || !gid_valid(gid)) + return -EINVAL; + + if ((creds->pid == task_tgid_vnr(current) || + ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && + ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || + uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && + ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || + gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + fpl->max = SCM_MAX_FD; + fpl->user = NULL; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > fpl->max) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget_raw(fd))) + return -EBADF; + /* don't allow io_uring files */ + if (io_uring_get_socket(file)) { + fput(file); + return -EINVAL; + } + *fpp++ = file; + fpl->count++; + } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + free_uid(fpl->user); + kfree(fpl); + } +} +EXPORT_SYMBOL(__scm_destroy); + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for_each_cmsghdr(cmsg, msg) { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. + But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (!CMSG_OK(msg, cmsg)) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + if (!sock->ops || sock->ops->family != PF_UNIX) + goto error; + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + { + struct ucred creds; + kuid_t uid; + kgid_t gid; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&creds); + if (err) + goto error; + + p->creds.pid = creds.pid; + if (!p->pid || pid_vnr(p->pid) != creds.pid) { + struct pid *pid; + err = -ESRCH; + pid = find_get_pid(creds.pid); + if (!pid) + goto error; + put_pid(p->pid); + p->pid = pid; + } + + err = -EINVAL; + uid = make_kuid(current_user_ns(), creds.uid); + gid = make_kgid(current_user_ns(), creds.gid); + if (!uid_valid(uid) || !gid_valid(gid)) + goto error; + + p->creds.uid = uid; + p->creds.gid = gid; + break; + } + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + return 0; + +error: + scm_destroy(p); + return err; +} +EXPORT_SYMBOL(__scm_send); + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + int cmlen = CMSG_LEN(len); + + if (msg->msg_flags & MSG_CMSG_COMPAT) + return put_cmsg_compat(msg, level, type, len, data); + + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. */ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + + check_object_size(data, cmlen - sizeof(*cm), true); + + if (!user_write_access_begin(cm, cmlen)) + goto efault; + + unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); + unsafe_put_user(level, &cm->cmsg_level, efault_end); + unsafe_put_user(type, &cm->cmsg_type, efault_end); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault_end); + user_write_access_end(); + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; +} +EXPORT_SYMBOL(put_cmsg); + +void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping64 tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping64); + +void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping); + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr __user *cm = + (__force struct cmsghdr __user *)msg->msg_control; + unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); + int __user *cmsg_data = CMSG_USER_DATA(cm); + int err = 0, i; + + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + + if (msg->msg_flags & MSG_CMSG_COMPAT) { + scm_detach_fds_compat(msg, scm); + return; + } + + for (i = 0; i < fdmax; i++) { + err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + if (err < 0) + break; + } + + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i * sizeof(int)); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. + */ + __scm_destroy(scm); +} +EXPORT_SYMBOL(scm_detach_fds); + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), + GFP_KERNEL_ACCOUNT); + if (new_fpl) { + for (i = 0; i < fpl->count; i++) + get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); + } + return new_fpl; +} +EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c new file mode 100644 index 000000000..b0ff6153b --- /dev/null +++ b/net/core/secure_seq.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/cache.h> +#include <linux/random.h> +#include <linux/hrtimer.h> +#include <linux/ktime.h> +#include <linux/string.h> +#include <linux/net.h> +#include <linux/siphash.h> +#include <net/secure_seq.h> + +#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <linux/in6.h> +#include <net/tcp.h> + +static siphash_aligned_key_t net_secret; +static siphash_aligned_key_t ts_secret; + +#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) + +static __always_inline void net_secret_init(void) +{ + net_get_random_once(&net_secret, sizeof(net_secret)); +} + +static __always_inline void ts_secret_init(void) +{ + net_get_random_once(&ts_secret, sizeof(ts_secret)); +} +#endif + +#ifdef CONFIG_INET +static u32 seq_scale(u32 seq) +{ + /* + * As close as possible to RFC 793, which + * suggests using a 250 kHz clock. + * Further reading shows this assumes 2 Mb/s networks. + * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. + * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but + * we also need to limit the resolution so that the u32 seq + * overlaps less than one time per MSL (2 minutes). + * Choosing a clock of 64 ns period is OK. (period of 274 s) + */ + return seq + (ktime_get_real_ns() >> 6); +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +u32 secure_tcpv6_ts_off(const struct net *net, + const __be32 *saddr, const __be32 *daddr) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + }; + + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + return 0; + + ts_secret_init(); + return siphash(&combined, offsetofend(typeof(combined), daddr), + &ts_secret); +} +EXPORT_SYMBOL(secure_tcpv6_ts_off); + +u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; + u32 hash; + + net_secret_init(); + hash = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + return seq_scale(hash); +} +EXPORT_SYMBOL(secure_tcpv6_seq); + +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + unsigned int timeseed; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + .dport = dport, + }; + net_secret_init(); + return siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); +} +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); +#endif + +#ifdef CONFIG_INET +u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) +{ + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) + return 0; + + ts_secret_init(); + return siphash_2u32((__force u32)saddr, (__force u32)daddr, + &ts_secret); +} + +/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), + * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, + * it would be easy enough to have the former function use siphash_4u32, passing + * the arguments as separate u32. + */ +u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash; + + net_secret_init(); + hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); + return seq_scale(hash); +} +EXPORT_SYMBOL_GPL(secure_tcp_seq); + +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +{ + net_secret_init(); + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, + jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + &net_secret); +} +EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); +#endif + +#if IS_ENABLED(CONFIG_IP_DCCP) +u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u64 seq; + net_secret_init(); + seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); + seq += ktime_get_real_ns(); + seq &= (1ull << 48) - 1; + return seq; +} +EXPORT_SYMBOL(secure_dccp_sequence_number); + +#if IS_ENABLED(CONFIG_IPV6) +u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; + u64 seq; + net_secret_init(); + seq = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + seq += ktime_get_real_ns(); + seq &= (1ull << 48) - 1; + return seq; +} +EXPORT_SYMBOL(secure_dccpv6_sequence_number); +#endif +#endif diff --git a/net/core/selftests.c b/net/core/selftests.c new file mode 100644 index 000000000..acb1ee97b --- /dev/null +++ b/net/core/selftests.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates. + * stmmac Selftests Support + * + * Author: Jose Abreu <joabreu@synopsys.com> + * + * Ported from stmmac by: + * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de> + */ + +#include <linux/phy.h> +#include <net/selftests.h> +#include <net/tcp.h> +#include <net/udp.h> + +struct net_packet_attrs { + const unsigned char *src; + const unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +static u8 net_test_next_id; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) + +static struct sk_buff *net_test_get_skb(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + NET_TEST_PKT_SIZE; + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(ndev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = skb_put(skb, sizeof(*uhdr)); + + eth_zero_addr(ehdr->h_dest); + + if (attr->src) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, attr->dst); + + ehdr->h_proto = htons(ETH_P_IP); + + if (attr->tcp) { + thdr->source = htons(attr->sport); + thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + thdr->check = 0; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); + attr->id = net_test_next_id; + shdr->id = net_test_next_id++; + + if (attr->size) + skb_put(skb, attr->size); + if (attr->max_size && attr->max_size > skb->len) + skb_put(skb, attr->max_size - skb->len); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, + ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = ndev; + + return skb; +} + +static int net_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct net_test_priv *tpriv = pt->af_packet_priv; + const unsigned char *src = tpriv->packet->src; + const unsigned char *dst = tpriv->packet->dst; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + + if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __net_test_loopback(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct net_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = net_test_loopback_validate; + tpriv->pt.dev = ndev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + dev_add_pack(&tpriv->pt); + + skb = net_test_get_skb(ndev, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + ret = dev_direct_xmit(skb, attr->queue_mapping); + if (ret < 0) { + goto cleanup; + } else if (ret > 0) { + ret = -ENETUNREACH; + goto cleanup; + } + + if (!attr->timeout) + attr->timeout = NET_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + ret = tpriv->ok ? 0 : -ETIMEDOUT; + +cleanup: + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + return ret; +} + +static int net_test_netif_carrier(struct net_device *ndev) +{ + return netif_carrier_ok(ndev) ? 0 : -ENOLINK; +} + +static int net_test_phy_phydev(struct net_device *ndev) +{ + return ndev->phydev ? 0 : -EOPNOTSUPP; +} + +static int net_test_phy_loopback_enable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, true); +} + +static int net_test_phy_loopback_disable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, false); +} + +static int net_test_phy_loopback_udp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_udp_mtu(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.max_size = ndev->mtu; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_tcp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + return __net_test_loopback(ndev, &attr); +} + +static const struct net_test { + char name[ETH_GSTRING_LEN]; + int (*fn)(struct net_device *ndev); +} net_selftests[] = { + { + .name = "Carrier ", + .fn = net_test_netif_carrier, + }, { + .name = "PHY dev is present ", + .fn = net_test_phy_phydev, + }, { + /* This test should be done before all PHY loopback test */ + .name = "PHY internal loopback, enable ", + .fn = net_test_phy_loopback_enable, + }, { + .name = "PHY internal loopback, UDP ", + .fn = net_test_phy_loopback_udp, + }, { + .name = "PHY internal loopback, MTU ", + .fn = net_test_phy_loopback_udp_mtu, + }, { + .name = "PHY internal loopback, TCP ", + .fn = net_test_phy_loopback_tcp, + }, { + /* This test should be done after all PHY loopback test */ + .name = "PHY internal loopback, disable", + .fn = net_test_phy_loopback_disable, + }, +}; + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) +{ + int count = net_selftest_get_count(); + int i; + + memset(buf, 0, sizeof(*buf) * count); + net_test_next_id = 0; + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + netdev_err(ndev, "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + + for (i = 0; i < count; i++) { + buf[i] = net_selftests[i].fn(ndev); + if (buf[i] && (buf[i] != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + } +} +EXPORT_SYMBOL_GPL(net_selftest); + +int net_selftest_get_count(void) +{ + return ARRAY_SIZE(net_selftests); +} +EXPORT_SYMBOL_GPL(net_selftest_get_count); + +void net_selftest_get_strings(u8 *data) +{ + u8 *p = data; + int i; + + for (i = 0; i < net_selftest_get_count(); i++) { + snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1, + net_selftests[i].name); + p += ETH_GSTRING_LEN; + } +} +EXPORT_SYMBOL_GPL(net_selftest_get_strings); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c new file mode 100644 index 000000000..8a819d0a7 --- /dev/null +++ b/net/core/skbuff.c @@ -0,0 +1,6687 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> + * Florian La Roche <rzsfl@rz.uni-sb.de> + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> +#include <linux/netdevice.h> +#ifdef CONFIG_NET_CLS_ACT +#include <net/pkt_sched.h> +#endif +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/splice.h> +#include <linux/cache.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/scatterlist.h> +#include <linux/errqueue.h> +#include <linux/prefetch.h> +#include <linux/if_vlan.h> +#include <linux/mpls.h> +#include <linux/kcov.h> + +#include <net/protocol.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <net/ip6_checksum.h> +#include <net/xfrm.h> +#include <net/mpls.h> +#include <net/mptcp.h> +#include <net/mctp.h> +#include <net/page_pool.h> + +#include <linux/uaccess.h> +#include <trace/events/skb.h> +#include <linux/highmem.h> +#include <linux/capability.h> +#include <linux/user_namespace.h> +#include <linux/indirect_call_wrapper.h> + +#include "dev.h" +#include "sock_destructor.h" + +struct kmem_cache *skbuff_head_cache __ro_after_init; +static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); + +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +const char * const drop_reasons[] = { + DEFINE_DROP_REASON(FN, FN) +}; +EXPORT_SYMBOL(drop_reasons); + +/** + * skb_panic - private function for out-of-line support + * @skb: buffer + * @sz: size + * @addr: address + * @msg: skb_over_panic or skb_under_panic + * + * Out-of-line support for skb_put() and skb_push(). + * Called via the wrapper skb_over_panic() or skb_under_panic(). + * Keep out of line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always reliable. + */ +static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, + const char msg[]) +{ + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", + msg, addr, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) +{ + skb_panic(skb, sz, addr, __func__); +} + +static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) +{ + skb_panic(skb, sz, addr, __func__); +} + +#define NAPI_SKB_CACHE_SIZE 64 +#define NAPI_SKB_CACHE_BULK 16 +#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) + +#if PAGE_SIZE == SZ_4K + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) + +/* specialized page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. Constrained to systems + * with a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else + +/* the small page is actually unused in this build; add dummy helpers + * to please the compiler and avoid later preprocessor's conditionals + */ +#define NAPI_HAS_SMALL_PAGE_FRAG 0 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + +struct napi_alloc_cache { + struct page_frag_cache page; + struct page_frag_1k page_small; + unsigned int skb_count; + void *skb_cache[NAPI_SKB_CACHE_SIZE]; +}; + +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); + +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + fragsz = SKB_DATA_ALIGN(fragsz); + + return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); +} +EXPORT_SYMBOL(__napi_alloc_frag_align); + +void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) +{ + void *data; + + fragsz = SKB_DATA_ALIGN(fragsz); + if (in_hardirq() || irqs_disabled()) { + struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); + + data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + } else { + struct napi_alloc_cache *nc; + + local_bh_disable(); + nc = this_cpu_ptr(&napi_alloc_cache); + data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + local_bh_enable(); + } + return data; +} +EXPORT_SYMBOL(__netdev_alloc_frag_align); + +static struct sk_buff *napi_skb_cache_get(void) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct sk_buff *skb; + + if (unlikely(!nc->skb_count)) { + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, + GFP_ATOMIC, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); + if (unlikely(!nc->skb_count)) + return NULL; + } + + skb = nc->skb_cache[--nc->skb_count]; + kasan_unpoison_object_data(skbuff_head_cache, skb); + + return skb; +} + +/* Caller must provide SKB that is memset cleared */ +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) +{ + struct skb_shared_info *shinfo; + unsigned int size = frag_size ? : ksize(data); + + size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + /* Assumes caller memset cleared SKB */ + skb->truesize = SKB_TRUESIZE(size); + refcount_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb_set_end_offset(skb, size); + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; + skb->alloc_cpu = raw_smp_processor_id(); + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + skb_set_kcov_handle(skb, kcov_common_handle()); +} + +/** + * __build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Allocate a new &sk_buff. Caller provides space holding head and + * skb_shared_info. @data must have been allocated by kmalloc() only if + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() + * The return is the new skb buffer. + * On a failure the return is %NULL, and @data is not freed. + * Notes : + * Before IO, driver allocates only data buffer where NIC put incoming frame + * Driver should add room at head (NET_SKB_PAD) and + * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) + * After IO, driver calls build_skb(), to allocate sk_buff and populate it + * before giving packet to stack. + * RX rings only contains data buffers, not full skbs. + */ +struct sk_buff *__build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} + +/* build_skb() is wrapper over __build_skb(), that specifically + * takes care of skb->head and skb->pfmemalloc + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc() + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (page_is_pfmemalloc(virt_to_head_page(data))) + skb->pfmemalloc = 1; + } + return skb; +} +EXPORT_SYMBOL(build_skb); + +/** + * build_skb_around - build a network buffer around provided skb + * @skb: sk_buff provide by caller, must be memset cleared + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + */ +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + if (unlikely(!skb)) + return NULL; + + __build_skb_around(skb, data, frag_size); + + if (frag_size) { + skb->head_frag = 1; + if (page_is_pfmemalloc(virt_to_head_page(data))) + skb->pfmemalloc = 1; + } + return skb; +} +EXPORT_SYMBOL(build_skb_around); + +/** + * __napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __build_skb() that uses NAPI percpu caches to obtain + * skbuff_head instead of inplace allocation. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb; + + skb = napi_skb_cache_get(); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, frag_size); + + return skb; +} + +/** + * napi_build_skb - build a network buffer + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + * + * Version of __napi_build_skb() that takes care of skb->head_frag + * and skb->pfmemalloc when the data is a page or page fragment. + * + * Returns a new &sk_buff on success, %NULL on allocation failure. + */ +struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __napi_build_skb(data, frag_size); + + if (likely(skb) && frag_size) { + skb->head_frag = 1; + skb_propagate_pfmemalloc(virt_to_head_page(data), skb); + } + + return skb; +} +EXPORT_SYMBOL(napi_build_skb); + +/* + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves + * may be used. Otherwise, the packet data may be discarded until enough + * memory is free + */ +static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, + bool *pfmemalloc) +{ + bool ret_pfmemalloc = false; + size_t obj_size; + void *obj; + + obj_size = SKB_HEAD_ALIGN(*size); + + obj_size = kmalloc_size_roundup(obj_size); + /* The following cast might truncate high-order bits of obj_size, this + * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. + */ + *size = (unsigned int)obj_size; + + /* + * Try a regular allocation, when that fails and we're not entitled + * to the reserves, fail. + */ + obj = kmalloc_node_track_caller(obj_size, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmalloc_node_track_caller(obj_size, flags, node); + +out: + if (pfmemalloc) + *pfmemalloc = ret_pfmemalloc; + + return obj; +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +/** + * __alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache + * instead of head cache and allocate a cloned (child) skb. + * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case the data is required for writeback + * @node: numa node to allocate memory on + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of at least size bytes. The object has a reference count + * of one. The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. + */ +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int flags, int node) +{ + struct kmem_cache *cache; + struct sk_buff *skb; + bool pfmemalloc; + u8 *data; + + cache = (flags & SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; + + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; + + /* Get the HEAD */ + if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && + likely(node == NUMA_NO_NODE || node == numa_mem_id())) + skb = napi_skb_cache_get(); + else + skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); + if (unlikely(!skb)) + return NULL; + prefetchw(skb); + + /* We do our best to align skb_shared_info on a separate cache + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives + * aligned memory blocks, unless SLUB/SLAB debug is enabled. + * Both skb->head and skb_shared_info are cache line aligned. + */ + data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); + if (unlikely(!data)) + goto nodata; + /* kmalloc_size_roundup() might give us more room than requested. + * Put skb_shared_info exactly at the end of allocated zone, + * to allow max possible filling before reallocation. + */ + prefetchw(data + SKB_WITH_OVERHEAD(size)); + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + __build_skb_around(skb, data, size); + skb->pfmemalloc = pfmemalloc; + + if (flags & SKB_ALLOC_FCLONE) { + struct sk_buff_fclones *fclones; + + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + skb->fclone = SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); + } + + return skb; + +nodata: + kmem_cache_free(cache, skb); + return NULL; +} +EXPORT_SYMBOL(__alloc_skb); + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @len: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has NET_SKB_PAD headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, + gfp_t gfp_mask) +{ + struct page_frag_cache *nc; + struct sk_buff *skb; + bool pfmemalloc; + void *data; + + len += NET_SKB_PAD; + + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } + + len = SKB_HEAD_ALIGN(len); + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + if (in_hardirq() || irqs_disabled()) { + nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + } else { + local_bh_disable(); + nc = this_cpu_ptr(&napi_alloc_cache.page); + data = page_frag_alloc(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; + local_bh_enable(); + } + + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; + } + + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + +skb_fail: + return skb; +} +EXPORT_SYMBOL(__netdev_alloc_skb); + +/** + * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @len: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages + * + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, + gfp_t gfp_mask) +{ + struct napi_alloc_cache *nc; + struct sk_buff *skb; + bool pfmemalloc; + void *data; + + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + len += NET_SKB_PAD + NET_IP_ALIGN; + + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + * When the small frag allocator is available, prefer it over kmalloc + * for small fragments + */ + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || + (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, + NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } + + nc = this_cpu_ptr(&napi_alloc_cache); + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less than GRO_MAX_HEAD makes little sense + * - On most systems, larger 'len' values lead to fragment + * size above 512 bytes + * - kmalloc would use the kmalloc-1k slab for such values + * - Builds with smaller GRO_MAX_HEAD will very likely do + * little networking, as that implies no WiFi and no + * tunnels support, and 32 bits arches. + */ + len = SZ_1K; + + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); + } else { + len = SKB_HEAD_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = nc->page.pfmemalloc; + } + + if (unlikely(!data)) + return NULL; + + skb = __napi_build_skb(data, len); + if (unlikely(!skb)) { + skb_free_frag(data); + return NULL; + } + + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + +skb_success: + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + +skb_fail: + return skb; +} +EXPORT_SYMBOL(__napi_alloc_skb); + +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, + int size, unsigned int truesize) +{ + skb_fill_page_desc(skb, i, page, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_add_rx_frag); + +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, + unsigned int truesize) +{ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + skb_frag_size_add(frag, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_coalesce_rx_frag); + +static void skb_drop_list(struct sk_buff **listp) +{ + kfree_skb_list(*listp); + *listp = NULL; +} + +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(&skb_shinfo(skb)->frag_list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list; + + skb_walk_frags(skb, list) + skb_get(list); +} + +static void skb_free_head(struct sk_buff *skb) +{ + unsigned char *head = skb->head; + + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; + skb_free_frag(head); + } else { + kfree(head); + } +} + +static void skb_release_data(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; + + if (skb->cloned && + atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &shinfo->dataref)) + goto exit; + + if (skb_zcopy(skb)) { + bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; + + skb_zcopy_clear(skb, true); + if (skip_unref) + goto free_head; + } + + for (i = 0; i < shinfo->nr_frags; i++) + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); + +free_head: + if (shinfo->frag_list) + kfree_skb_list(shinfo->frag_list); + + skb_free_head(skb); +exit: + /* When we clone an SKB we copy the reycling bit. The pp_recycle + * bit is only set on the head though, so in order to avoid races + * while trying to recycle fragments on __skb_frag_unref() we need + * to make one SKB responsible for triggering the recycle path. + * So disable the recycling bit if an SKB is cloned and we have + * additional references to the fragmented part of the SKB. + * Eventually the last SKB will have the recycling bit set and it's + * dataref set to 0, which will trigger the recycling + */ + skb->pp_recycle = 0; +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +static void kfree_skbmem(struct sk_buff *skb) +{ + struct sk_buff_fclones *fclones; + + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); + return; + + case SKB_FCLONE_ORIG: + fclones = container_of(skb, struct sk_buff_fclones, skb1); + + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. + */ + if (refcount_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; + + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); + break; + } + if (!refcount_dec_and_test(&fclones->fclone_ref)) + return; +fastpath: + kmem_cache_free(skbuff_fclone_cache, fclones); +} + +void skb_release_head_state(struct sk_buff *skb) +{ + skb_dst_drop(skb); + if (skb->destructor) { + DEBUG_NET_WARN_ON_ONCE(in_hardirq()); + skb->destructor(skb); + } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + nf_conntrack_put(skb_nfct(skb)); +#endif + skb_ext_put(skb); +} + +/* Free everything but the sk_buff shell. */ +static void skb_release_all(struct sk_buff *skb) +{ + skb_release_head_state(skb); + if (likely(skb->head)) + skb_release_data(skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ + skb_release_all(skb); + kfree_skbmem(skb); +} +EXPORT_SYMBOL(__kfree_skb); + +/** + * kfree_skb_reason - free an sk_buff with special reason + * @skb: buffer to free + * @reason: reason why this skb is dropped + * + * Drop a reference to the buffer and free it if the usage count has + * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' + * tracepoint. + */ +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +{ + if (unlikely(!skb_unref(skb))) + return; + + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + + trace_kfree_skb(skb, __builtin_return_address(0), reason); + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb_reason); + +void kfree_skb_list_reason(struct sk_buff *segs, + enum skb_drop_reason reason) +{ + while (segs) { + struct sk_buff *next = segs->next; + + kfree_skb_reason(segs, reason); + segs = next; + } +} +EXPORT_SYMBOL(kfree_skb_list_reason); + +/* Dump skb information and contents. + * + * Must only be called from net_ratelimit()-ed paths. + * + * Dumps whole packets if full_pkt, only headers otherwise. + */ +void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + struct net_device *dev = skb->dev; + struct sock *sk = skb->sk; + struct sk_buff *list_skb; + bool has_mac, has_trans; + int headroom, tailroom; + int i, len, seg_len; + + if (full_pkt) + len = skb->len; + else + len = min_t(int, skb->len, MAX_HEADER + 128); + + headroom = skb_headroom(skb); + tailroom = skb_tailroom(skb); + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" + "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" + "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + level, skb->len, headroom, skb_headlen(skb), tailroom, + has_mac ? skb->mac_header : -1, + has_mac ? skb_mac_header_len(skb) : -1, + skb->network_header, + has_trans ? skb_network_header_len(skb) : -1, + has_trans ? skb->transport_header : -1, + sh->tx_flags, sh->nr_frags, + sh->gso_size, sh->gso_type, sh->gso_segs, + skb->csum, skb->ip_summed, skb->csum_complete_sw, + skb->csum_valid, skb->csum_level, + skb->hash, skb->sw_hash, skb->l4_hash, + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + + if (dev) + printk("%sdev name=%s feat=%pNF\n", + level, dev->name, &dev->features); + if (sk) + printk("%ssk family=%hu type=%u proto=%u\n", + level, sk->sk_family, sk->sk_type, sk->sk_protocol); + + if (full_pkt && headroom) + print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->head, headroom, false); + + seg_len = min_t(int, skb_headlen(skb), len); + if (seg_len) + print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, + 16, 1, skb->data, seg_len, false); + len -= seg_len; + + if (full_pkt && tailroom) + print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, + 16, 1, skb_tail_pointer(skb), tailroom, false); + + for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + skb_frag_foreach_page(frag, skb_frag_off(frag), + skb_frag_size(frag), p, p_off, p_len, + copied) { + seg_len = min_t(int, p_len, len); + vaddr = kmap_atomic(p); + print_hex_dump(level, "skb frag: ", + DUMP_PREFIX_OFFSET, + 16, 1, vaddr + p_off, seg_len, false); + kunmap_atomic(vaddr); + len -= seg_len; + if (!len) + break; + } + } + + if (full_pkt && skb_has_frag_list(skb)) { + printk("skb fraglist:\n"); + skb_walk_frags(skb, list_skb) + skb_dump(level, list_skb, true); + } +} +EXPORT_SYMBOL(skb_dump); + +/** + * skb_tx_error - report an sk_buff xmit error + * @skb: buffer that triggered an error + * + * Report xmit error if a device callback is tracking this skb. + * skb must be freed afterwards. + */ +void skb_tx_error(struct sk_buff *skb) +{ + if (skb) { + skb_zcopy_downgrade_managed(skb); + skb_zcopy_clear(skb, true); + } +} +EXPORT_SYMBOL(skb_tx_error); + +#ifdef CONFIG_TRACEPOINTS +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (!skb_unref(skb)) + return; + + trace_consume_skb(skb); + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); +#endif + +/** + * __consume_stateless_skb - free an skbuff, assuming it is stateless + * @skb: buffer to free + * + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped + */ +void __consume_stateless_skb(struct sk_buff *skb) +{ + trace_consume_skb(skb); + skb_release_data(skb); + kfree_skbmem(skb); +} + +static void napi_skb_cache_put(struct sk_buff *skb) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 i; + + kasan_poison_object_data(skbuff_head_cache, skb); + nc->skb_cache[nc->skb_count++] = skb; + + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { + for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) + kasan_unpoison_object_data(skbuff_head_cache, + nc->skb_cache[i]); + + kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF, + nc->skb_cache + NAPI_SKB_CACHE_HALF); + nc->skb_count = NAPI_SKB_CACHE_HALF; + } +} + +void __kfree_skb_defer(struct sk_buff *skb) +{ + skb_release_all(skb); + napi_skb_cache_put(skb); +} + +void napi_skb_free_stolen_head(struct sk_buff *skb) +{ + if (unlikely(skb->slow_gro)) { + nf_reset_ct(skb); + skb_dst_drop(skb); + skb_ext_put(skb); + skb_orphan(skb); + skb->slow_gro = 0; + } + napi_skb_cache_put(skb); +} + +void napi_consume_skb(struct sk_buff *skb, int budget) +{ + /* Zero budget indicate non-NAPI context called us, like netpoll */ + if (unlikely(!budget)) { + dev_consume_skb_any(skb); + return; + } + + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (!skb_unref(skb)) + return; + + /* if reaching here SKB is ready to free */ + trace_consume_skb(skb); + + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + skb_release_all(skb); + napi_skb_cache_put(skb); +} +EXPORT_SYMBOL(napi_consume_skb); + +/* Make sure a field is contained by headers group */ +#define CHECK_SKB_FIELD(field) \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ + offsetof(struct sk_buff, headers.field)); \ + +static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + new->tstamp = old->tstamp; + /* We do not copy old->sk */ + new->dev = old->dev; + memcpy(new->cb, old->cb, sizeof(old->cb)); + skb_dst_copy(new, old); + __skb_ext_copy(new, old); + __nf_copy(new, old, false); + + /* Note : this field could be in the headers group. + * It is not yet because we do not want to have a 16 bit hole + */ + new->queue_mapping = old->queue_mapping; + + memcpy(&new->headers, &old->headers, sizeof(new->headers)); + CHECK_SKB_FIELD(protocol); + CHECK_SKB_FIELD(csum); + CHECK_SKB_FIELD(hash); + CHECK_SKB_FIELD(priority); + CHECK_SKB_FIELD(skb_iif); + CHECK_SKB_FIELD(vlan_proto); + CHECK_SKB_FIELD(vlan_tci); + CHECK_SKB_FIELD(transport_header); + CHECK_SKB_FIELD(network_header); + CHECK_SKB_FIELD(mac_header); + CHECK_SKB_FIELD(inner_protocol); + CHECK_SKB_FIELD(inner_transport_header); + CHECK_SKB_FIELD(inner_network_header); + CHECK_SKB_FIELD(inner_mac_header); + CHECK_SKB_FIELD(mark); +#ifdef CONFIG_NETWORK_SECMARK + CHECK_SKB_FIELD(secmark); +#endif +#ifdef CONFIG_NET_RX_BUSY_POLL + CHECK_SKB_FIELD(napi_id); +#endif + CHECK_SKB_FIELD(alloc_cpu); +#ifdef CONFIG_XPS + CHECK_SKB_FIELD(sender_cpu); +#endif +#ifdef CONFIG_NET_SCHED + CHECK_SKB_FIELD(tc_index); +#endif + +} + +/* + * You should not add any new code to this function. Add it to + * __copy_skb_header above instead. + */ +static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) +{ +#define C(x) n->x = skb->x + + n->next = n->prev = NULL; + n->sk = NULL; + __copy_skb_header(n, skb); + + C(len); + C(data_len); + C(mac_len); + n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; + n->cloned = 1; + n->nohdr = 0; + n->peeked = 0; + C(pfmemalloc); + C(pp_recycle); + n->destructor = NULL; + C(tail); + C(end); + C(head); + C(head_frag); + C(data); + C(truesize); + refcount_set(&n->users, 1); + + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + + return n; +#undef C +} + +/** + * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg + * @first: first sk_buff of the msg + */ +struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) +{ + struct sk_buff *n; + + n = alloc_skb(0, GFP_ATOMIC); + if (!n) + return NULL; + + n->len = first->len; + n->data_len = first->len; + n->truesize = first->truesize; + + skb_shinfo(n)->frag_list = first; + + __copy_skb_header(n, first); + n->destructor = NULL; + + return n; +} +EXPORT_SYMBOL_GPL(alloc_skb_for_msg); + +/** + * skb_morph - morph one skb into another + * @dst: the skb to receive the contents + * @src: the skb to supply the contents + * + * This is identical to skb_clone except that the target skb is + * supplied by the user. + * + * The target skb is returned upon exit. + */ +struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) +{ + skb_release_all(dst); + return __skb_clone(dst, src); +} +EXPORT_SYMBOL_GPL(skb_morph); + +int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +{ + unsigned long max_pg, num_pg, new_pg, old_pg; + struct user_struct *user; + + if (capable(CAP_IPC_LOCK) || !size) + return 0; + + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ + max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + user = mmp->user ? : current_user(); + + do { + old_pg = atomic_long_read(&user->locked_vm); + new_pg = old_pg + num_pg; + if (new_pg > max_pg) + return -ENOBUFS; + } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != + old_pg); + + if (!mmp->user) { + mmp->user = get_uid(user); + mmp->num_pg = num_pg; + } else { + mmp->num_pg += num_pg; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mm_account_pinned_pages); + +void mm_unaccount_pinned_pages(struct mmpin *mmp) +{ + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + } +} +EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); + +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +{ + struct ubuf_info_msgzc *uarg; + struct sk_buff *skb; + + WARN_ON_ONCE(!in_task()); + + skb = sock_omalloc(sk, 0, GFP_KERNEL); + if (!skb) + return NULL; + + BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); + uarg = (void *)skb->cb; + uarg->mmp.user = NULL; + + if (mm_account_pinned_pages(&uarg->mmp, size)) { + kfree_skb(skb); + return NULL; + } + + uarg->ubuf.callback = msg_zerocopy_callback; + uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; + uarg->len = 1; + uarg->bytelen = size; + uarg->zerocopy = 1; + uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + refcount_set(&uarg->ubuf.refcnt, 1); + sock_hold(sk); + + return &uarg->ubuf; +} + +static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) +{ + return container_of((void *)uarg, struct sk_buff, cb); +} + +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) +{ + if (uarg) { + struct ubuf_info_msgzc *uarg_zc; + const u32 byte_limit = 1 << 19; /* limit to a few TSO */ + u32 bytelen, next; + + /* there might be non MSG_ZEROCOPY users */ + if (uarg->callback != msg_zerocopy_callback) + return NULL; + + /* realloc only when socket is locked (TCP, UDP cork), + * so uarg->len and sk_zckey access is serialized + */ + if (!sock_owned_by_user(sk)) { + WARN_ON_ONCE(1); + return NULL; + } + + uarg_zc = uarg_to_msgzc(uarg); + bytelen = uarg_zc->bytelen + size; + if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { + /* TCP can create new skb to attach new uarg */ + if (sk->sk_type == SOCK_STREAM) + goto new_alloc; + return NULL; + } + + next = (u32)atomic_read(&sk->sk_zckey); + if ((u32)(uarg_zc->id + uarg_zc->len) == next) { + if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + return NULL; + uarg_zc->len++; + uarg_zc->bytelen = bytelen; + atomic_set(&sk->sk_zckey, ++next); + + /* no extra ref when appending to datagram (MSG_MORE) */ + if (sk->sk_type == SOCK_STREAM) + net_zcopy_get(uarg); + + return uarg; + } + } + +new_alloc: + return msg_zerocopy_alloc(sk, size); +} +EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); + +static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 old_lo, old_hi; + u64 sum_len; + + old_lo = serr->ee.ee_info; + old_hi = serr->ee.ee_data; + sum_len = old_hi - old_lo + 1ULL + len; + + if (sum_len >= (1ULL << 32)) + return false; + + if (lo != old_hi + 1) + return false; + + serr->ee.ee_data += len; + return true; +} + +static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) +{ + struct sk_buff *tail, *skb = skb_from_uarg(uarg); + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + struct sk_buff_head *q; + unsigned long flags; + bool is_zerocopy; + u32 lo, hi; + u16 len; + + mm_unaccount_pinned_pages(&uarg->mmp); + + /* if !len, there was only 1 call, and it was aborted + * so do not queue a completion notification + */ + if (!uarg->len || sock_flag(sk, SOCK_DEAD)) + goto release; + + len = uarg->len; + lo = uarg->id; + hi = uarg->id + len - 1; + is_zerocopy = uarg->zerocopy; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; + serr->ee.ee_data = hi; + serr->ee.ee_info = lo; + if (!is_zerocopy) + serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || + !skb_zerocopy_notify_extend(tail, lo, len)) { + __skb_queue_tail(q, skb); + skb = NULL; + } + spin_unlock_irqrestore(&q->lock, flags); + + sk_error_report(sk); + +release: + consume_skb(skb); + sock_put(sk); +} + +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) +{ + struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); + + uarg_zc->zerocopy = uarg_zc->zerocopy & success; + + if (refcount_dec_and_test(&uarg->refcnt)) + __msg_zerocopy_callback(uarg_zc); +} +EXPORT_SYMBOL_GPL(msg_zerocopy_callback); + +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) +{ + struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; + + atomic_dec(&sk->sk_zckey); + uarg_to_msgzc(uarg)->len--; + + if (have_uref) + msg_zerocopy_callback(NULL, uarg, true); +} +EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg) +{ + struct ubuf_info *orig_uarg = skb_zcopy(skb); + int err, orig_len = skb->len; + + /* An skb can only point to one uarg. This edge case happens when + * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + struct sock *save_sk = skb->sk; + + /* Streams do not free skb on error. Reset to prev state. */ + iov_iter_revert(&msg->msg_iter, skb->len - orig_len); + skb->sk = sk; + ___pskb_trim(skb, orig_len); + skb->sk = save_sk; + return err; + } + + skb_zcopy_set(skb, uarg, NULL); + return skb->len - orig_len; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); + +void __skb_zcopy_downgrade_managed(struct sk_buff *skb) +{ + int i; + + skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); +} +EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); + +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, + gfp_t gfp_mask) +{ + if (skb_zcopy(orig)) { + if (skb_zcopy(nskb)) { + /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ + if (!gfp_mask) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + if (skb_uarg(nskb) == skb_uarg(orig)) + return 0; + if (skb_copy_ubufs(nskb, GFP_ATOMIC)) + return -EIO; + } + skb_zcopy_set(nskb, skb_uarg(orig), NULL); + } + return 0; +} + +/** + * skb_copy_ubufs - copy userspace skb frags buffers to kernel + * @skb: the skb to modify + * @gfp_mask: allocation priority + * + * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. + * It will copy all frags into kernel and drop the reference + * to userspace pages. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + * + * Returns 0 on success or a negative error code on failure + * to allocate kernel memory to copy to. + */ +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) +{ + int num_frags = skb_shinfo(skb)->nr_frags; + struct page *page, *head = NULL; + int i, order, psize, new_frags; + u32 d_off; + + if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) + return -EINVAL; + + if (!num_frags) + goto release; + + /* We might have to allocate high order pages, so compute what minimum + * page order is needed. + */ + order = 0; + while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) + order++; + psize = (PAGE_SIZE << order); + + new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); + for (i = 0; i < new_frags; i++) { + page = alloc_pages(gfp_mask | __GFP_COMP, order); + if (!page) { + while (head) { + struct page *next = (struct page *)page_private(head); + put_page(head); + head = next; + } + return -ENOMEM; + } + set_page_private(page, (unsigned long)head); + head = page; + } + + page = head; + d_off = 0; + for (i = 0; i < num_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), + p, p_off, p_len, copied) { + u32 copy, done = 0; + vaddr = kmap_atomic(p); + + while (done < p_len) { + if (d_off == psize) { + d_off = 0; + page = (struct page *)page_private(page); + } + copy = min_t(u32, psize - d_off, p_len - done); + memcpy(page_address(page) + d_off, + vaddr + p_off + done, copy); + done += copy; + d_off += copy; + } + kunmap_atomic(vaddr); + } + } + + /* skb frags release userspace buffers */ + for (i = 0; i < num_frags; i++) + skb_frag_unref(skb, i); + + /* skb frags point to kernel buffers */ + for (i = 0; i < new_frags - 1; i++) { + __skb_fill_page_desc(skb, i, head, 0, psize); + head = (struct page *)page_private(head); + } + __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + skb_shinfo(skb)->nr_frags = new_frags; + +release: + skb_zcopy_clear(skb, false); + return 0; +} +EXPORT_SYMBOL_GPL(skb_copy_ubufs); + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +{ + struct sk_buff_fclones *fclones = container_of(skb, + struct sk_buff_fclones, + skb1); + struct sk_buff *n; + + if (skb_orphan_frags(skb, gfp_mask)) + return NULL; + + if (skb->fclone == SKB_FCLONE_ORIG && + refcount_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + refcount_set(&fclones->fclone_ref, 2); + n->fclone = SKB_FCLONE_CLONE; + } else { + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + + return __skb_clone(n, skb); +} +EXPORT_SYMBOL(skb_clone); + +void skb_headers_offset_update(struct sk_buff *skb, int off) +{ + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += off; + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->transport_header += off; + skb->network_header += off; + if (skb_mac_header_was_set(skb)) + skb->mac_header += off; + skb->inner_transport_header += off; + skb->inner_network_header += off; + skb->inner_mac_header += off; +} +EXPORT_SYMBOL(skb_headers_offset_update); + +void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) +{ + __copy_skb_header(new, old); + + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; +} +EXPORT_SYMBOL(skb_copy_header); + +static inline int skb_alloc_rx_flag(const struct sk_buff *skb) +{ + if (skb_pfmemalloc(skb)) + return SKB_ALLOC_RX; + return 0; +} + +/** + * skb_copy - create private copy of an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * As by-product this function converts non-linear &sk_buff to linear + * one, so that &sk_buff becomes completely private and caller is allowed + * to modify all the data of returned buffer. This means that this + * function is not recommended for use in circumstances when only + * header is going to be modified. Use pskb_copy() instead. + */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) +{ + int headerlen = skb_headroom(skb); + unsigned int size = skb_end_offset(skb) + skb->data_len; + struct sk_buff *n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); + + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); + + skb_copy_header(n, skb); + return n; +} +EXPORT_SYMBOL(skb_copy); + +/** + * __pskb_copy_fclone - create copy of an sk_buff with private head. + * @skb: buffer to copy + * @headroom: headroom of new skb + * @gfp_mask: allocation priority + * @fclone: if true allocate the copy of the skb from the fclone + * cache instead of the head cache; it is recommended to set this + * to true for the cases where the copy will likely be cloned + * + * Make a copy of both an &sk_buff and part of its data, located + * in header. Fragmented data remain shared. This is used when + * the caller wishes to modify only header of &sk_buff and needs + * private copy of the header to alter. Returns %NULL on failure + * or the pointer to the buffer on success. + * The returned buffer has a reference count of 1. + */ + +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, + gfp_t gfp_mask, bool fclone) +{ + unsigned int size = skb_headlen(skb) + headroom; + int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); + + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, headroom); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + skb_copy_from_linear_data(skb, n->data, n->len); + + n->truesize += skb->data_len; + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { + kfree_skb(n); + n = NULL; + goto out; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + skb_frag_ref(skb, i); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_has_frag_list(skb)) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + skb_copy_header(n, skb); +out: + return n; +} +EXPORT_SYMBOL(__pskb_copy_fclone); + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if @nhead and @ntail are zero) + * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + gfp_t gfp_mask) +{ + unsigned int osize = skb_end_offset(skb); + unsigned int size = osize + nhead + ntail; + long off; + u8 *data; + int i; + + BUG_ON(nhead < 0); + + BUG_ON(skb_shared(skb)); + + skb_zcopy_downgrade_managed(skb); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + goto nodata; + size = SKB_WITH_OVERHEAD(size); + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. + */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); + + /* + * if shinfo is shared we must drop the old head gracefully, but if it + * is not we can just drop the old head and let the existing refcount + * be since all we did is relocate the values + */ + if (skb_cloned(skb)) { + if (skb_orphan_frags(skb, gfp_mask)) + goto nofrags; + if (skb_zcopy(skb)) + refcount_inc(&skb_uarg(skb)->refcnt); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + skb_release_data(skb); + } else { + skb_free_head(skb); + } + off = (data + nhead) - skb->head; + + skb->head = data; + skb->head_frag = 0; + skb->data += off; + + skb_set_end_offset(skb, size); +#ifdef NET_SKBUFF_DATA_USES_OFFSET + off = nhead; +#endif + skb->tail += off; + skb_headers_offset_update(skb, nhead); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + skb_metadata_clear(skb); + + /* It is not generally safe to change skb->truesize. + * For the moment, we really care of rx path, or + * when skb is orphaned (not attached to a socket). + */ + if (!skb->sk || skb->destructor == sock_edemux) + skb->truesize += size - osize; + + return 0; + +nofrags: + kfree(data); +nodata: + return -ENOMEM; +} +EXPORT_SYMBOL(pskb_expand_head); + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, + GFP_ATOMIC)) { + kfree_skb(skb2); + skb2 = NULL; + } + } + return skb2; +} +EXPORT_SYMBOL(skb_realloc_headroom); + +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + +/** + * skb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @headroom: needed headroom + * + * Unlike skb_realloc_headroom, this one does not allocate a new skb + * if possible; copies skb->sk to new skb as needed + * and frees original skb in case of failures. + * + * It expect increased headroom and generates warning otherwise. + */ + +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) +{ + int delta = headroom - skb_headroom(skb); + int osize = skb_end_offset(skb); + struct sock *sk = skb->sk; + + if (WARN_ONCE(delta <= 0, + "%s is expecting an increase in the headroom", __func__)) + return skb; + + delta = SKB_DATA_ALIGN(delta); + /* pskb_expand_head() might crash, if skb is shared. */ + if (skb_shared(skb) || !is_skb_wmem(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (unlikely(!nskb)) + goto fail; + + if (sk) + skb_set_owner_w(nskb, sk); + consume_skb(skb); + skb = nskb; + } + if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) + goto fail; + + if (sk && is_skb_wmem(skb)) { + delta = skb_end_offset(skb) - osize; + refcount_add(delta, &sk->sk_wmem_alloc); + skb->truesize += delta; + } + return skb; + +fail: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_expand_head); + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, + gfp_t gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); + int oldheadroom = skb_headroom(skb); + int head_copy_len, head_copy_off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = oldheadroom; + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. */ + BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)); + + skb_copy_header(n, skb); + + skb_headers_offset_update(n, newheadroom - oldheadroom); + + return n; +} +EXPORT_SYMBOL(skb_copy_expand); + +/** + * __skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * @free_on_error: free buffer on error + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error + * if @free_on_error is true. + */ + +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) +{ + int err; + int ntail; + + /* If the skbuff is non linear tailroom is always zero.. */ + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { + memset(skb->data+skb->len, 0, pad); + return 0; + } + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: + if (free_on_error) + kfree_skb(skb); + return err; +} +EXPORT_SYMBOL(__skb_pad); + +/** + * pskb_put - add data to the tail of a potentially fragmented buffer + * @skb: start of the buffer to use + * @tail: tail fragment of the buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the potentially + * fragmented buffer. @tail must be the last fragment of @skb -- or + * @skb itself. If this would exceed the total buffer size the kernel + * will panic. A pointer to the first byte of the extra data is + * returned. + */ + +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); + +/** + * skb_put - add data to a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer. If this would + * exceed the total buffer size the kernel will panic. A pointer to the + * first byte of the extra data is returned. + */ +void *skb_put(struct sk_buff *skb, unsigned int len) +{ + void *tmp = skb_tail_pointer(skb); + SKB_LINEAR_ASSERT(skb); + skb->tail += len; + skb->len += len; + if (unlikely(skb->tail > skb->end)) + skb_over_panic(skb, len, __builtin_return_address(0)); + return tmp; +} +EXPORT_SYMBOL(skb_put); + +/** + * skb_push - add data to the start of a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer at the buffer + * start. If this would exceed the total buffer headroom the kernel will + * panic. A pointer to the first byte of the extra data is returned. + */ +void *skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data -= len; + skb->len += len; + if (unlikely(skb->data < skb->head)) + skb_under_panic(skb, len, __builtin_return_address(0)); + return skb->data; +} +EXPORT_SYMBOL(skb_push); + +/** + * skb_pull - remove data from the start of a buffer + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the next data in the buffer + * is returned. Once the data has been pulled future pushes will overwrite + * the old data. + */ +void *skb_pull(struct sk_buff *skb, unsigned int len) +{ + return skb_pull_inline(skb, len); +} +EXPORT_SYMBOL(skb_pull); + +/** + * skb_pull_data - remove data from the start of a buffer returning its + * original position. + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the original data in the buffer + * is returned after checking if there is enough data to pull. Once the + * data has been pulled future pushes will overwrite the old data. + */ +void *skb_pull_data(struct sk_buff *skb, size_t len) +{ + void *data = skb->data; + + if (skb->len < len) + return NULL; + + skb_pull(skb, len); + + return data; +} +EXPORT_SYMBOL(skb_pull_data); + +/** + * skb_trim - remove end from a buffer + * @skb: buffer to alter + * @len: new length + * + * Cut the length of a buffer down by removing data from the tail. If + * the buffer is already under the length specified it is not modified. + * The skb must be linear. + */ +void skb_trim(struct sk_buff *skb, unsigned int len) +{ + if (skb->len > len) + __skb_trim(skb, len); +} +EXPORT_SYMBOL(skb_trim); + +/* Trims skb to length len. It can change skb pointers. + */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len) +{ + struct sk_buff **fragp; + struct sk_buff *frag; + int offset = skb_headlen(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int i; + int err; + + if (skb_cloned(skb) && + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) + return err; + + i = 0; + if (offset >= len) + goto drop_pages; + + for (; i < nfrags; i++) { + int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (end < len) { + offset = end; + continue; + } + + skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); + +drop_pages: + skb_shinfo(skb)->nr_frags = i; + + for (; i < nfrags; i++) + skb_frag_unref(skb, i); + + if (skb_has_frag_list(skb)) + skb_drop_fraglist(skb); + goto done; + } + + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); + fragp = &frag->next) { + int end = offset + frag->len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag->next = frag->next; + consume_skb(frag); + frag = nfrag; + *fragp = frag; + } + + if (end < len) { + offset = end; + continue; + } + + if (end > len && + unlikely((err = pskb_trim(frag, len - offset)))) + return err; + + if (frag->next) + skb_drop_list(&frag->next); + break; + } + +done: + if (len > skb_headlen(skb)) { + skb->data_len -= skb->len - len; + skb->len = len; + } else { + skb->len = len; + skb->data_len = 0; + skb_set_tail_pointer(skb, len); + } + + if (!skb->sk || skb->destructor == sock_edemux) + skb_condense(skb); + return 0; +} +EXPORT_SYMBOL(___pskb_trim); + +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + int delta = skb->len - len; + + skb->csum = csum_block_sub(skb->csum, + skb_checksum(skb, len, delta, 0), + len); + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; + } + return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); + +/** + * __pskb_pull_tail - advance tail of skb header + * @skb: buffer to reallocate + * @delta: number of bytes to advance tail + * + * The function makes a sense only on a fragmented &sk_buff, + * it expands header moving its tail forward and copying necessary + * data from fragmented part. + * + * &sk_buff MUST have reference count of 1. + * + * Returns %NULL (and &sk_buff does not change) if pull failed + * or value of new tail of skb in the case of success. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +void *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ + /* If skb has not enough free space at tail, get new one + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ + int i, k, eat = (skb->tail + delta) - skb->end; + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, + GFP_ATOMIC)) + return NULL; + } + + BUG_ON(skb_copy_bits(skb, skb_headlen(skb), + skb_tail_pointer(skb), delta)); + + /* Optimization: no fragments, no reasons to preestimate + * size of pulled pages. Superb. + */ + if (!skb_has_frag_list(skb)) + goto pull_pages; + + /* Estimate size of pulled pages. */ + eat = delta; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size >= eat) + goto pull_pages; + eat -= size; + } + + /* If we need update frag list, we are in troubles. + * Certainly, it is possible to add an offset to skb data, + * but taking into account that pulling is expected to + * be very rare operation, it is worth to fight against + * further bloating skb head and crucify ourselves here instead. + * Pure masohism, indeed. 8)8) + */ + if (eat) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_is_gso(skb) && !list->head_frag && + skb_headlen(list)) + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + + if (skb_shared(list)) { + /* Sucks! We need to fork list. :-( */ + clone = skb_clone(list, GFP_ATOMIC); + if (!clone) + return NULL; + insp = list->next; + list = clone; + } else { + /* This may be pulled without + * problems. */ + insp = list; + } + if (!pskb_pull(list, eat)) { + kfree_skb(clone); + return NULL; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = skb_shinfo(skb)->frag_list) != insp) { + skb_shinfo(skb)->frag_list = list->next; + consume_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + skb_shinfo(skb)->frag_list = clone; + } + } + /* Success! Now we may commit changes to skb data. */ + +pull_pages: + eat = delta; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size <= eat) { + skb_frag_unref(skb, i); + eat -= size; + } else { + skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; + + *frag = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_frag_off_add(frag, eat); + skb_frag_size_sub(frag, eat); + if (!i) + goto end; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + +end: + skb->tail += delta; + skb->data_len -= delta; + + if (!skb->data_len) + skb_zcopy_clear(skb, false); + + return skb_tail_pointer(skb); +} +EXPORT_SYMBOL(__pskb_pull_tail); + +/** + * skb_copy_bits - copy bits from skb to kernel buffer + * @skb: source skb + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source skb to the + * destination buffer. + * + * CAUTION ! : + * If its prototype is ever changed, + * check arch/{*}/net/{*}.S files, + * since it is called from BPF assembly code. + */ +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_from_linear_data_offset(skb, offset, to, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(f); + if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + if (copy > len) + copy = len; + + skb_frag_foreach_page(f, + skb_frag_off(f) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(to + copied, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(frag_iter, offset - start, to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_bits); + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + put_page(spd->pages[i]); +} + +static struct page *linear_to_page(struct page *page, unsigned int *len, + unsigned int *offset, + struct sock *sk) +{ + struct page_frag *pfrag = sk_page_frag(sk); + + if (!sk_page_frag_refill(sk, pfrag)) + return NULL; + + *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); + + memcpy(page_address(pfrag->page) + pfrag->offset, + page_address(page) + *offset, *len); + *offset = pfrag->offset; + pfrag->offset += *len; + + return pfrag->page; +} + +static bool spd_can_coalesce(const struct splice_pipe_desc *spd, + struct page *page, + unsigned int offset) +{ + return spd->nr_pages && + spd->pages[spd->nr_pages - 1] == page && + (spd->partial[spd->nr_pages - 1].offset + + spd->partial[spd->nr_pages - 1].len == offset); +} + +/* + * Fill page/offset/length into spd, if it can hold more pages. + */ +static bool spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, + unsigned int *len, unsigned int offset, + bool linear, + struct sock *sk) +{ + if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) + return true; + + if (linear) { + page = linear_to_page(page, len, &offset, sk); + if (!page) + return true; + } + if (spd_can_coalesce(spd, page, offset)) { + spd->partial[spd->nr_pages - 1].len += *len; + return false; + } + get_page(page); + spd->pages[spd->nr_pages] = page; + spd->partial[spd->nr_pages].len = *len; + spd->partial[spd->nr_pages].offset = offset; + spd->nr_pages++; + + return false; +} + +static bool __splice_segment(struct page *page, unsigned int poff, + unsigned int plen, unsigned int *off, + unsigned int *len, + struct splice_pipe_desc *spd, bool linear, + struct sock *sk, + struct pipe_inode_info *pipe) +{ + if (!*len) + return true; + + /* skip this segment if already processed */ + if (*off >= plen) { + *off -= plen; + return false; + } + + /* ignore any bits we already processed */ + poff += *off; + plen -= *off; + *off = 0; + + do { + unsigned int flen = min(*len, plen); + + if (spd_fill_page(spd, pipe, page, &flen, poff, + linear, sk)) + return true; + poff += flen; + plen -= flen; + *len -= flen; + } while (*len && plen); + + return false; +} + +/* + * Map linear and fragment data from the skb to spd. It reports true if the + * pipe is full or if we already spliced the requested length. + */ +static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) +{ + int seg; + struct sk_buff *iter; + + /* map the linear part : + * If skb->head_frag is set, this 'linear' part is backed by a + * fragment, and if the head is not shared with any clones then + * we can avoid a copy since we own the head portion of this page. + */ + if (__splice_segment(virt_to_page(skb->data), + (unsigned long) skb->data & (PAGE_SIZE - 1), + skb_headlen(skb), + offset, len, spd, + skb_head_is_locked(skb), + sk, pipe)) + return true; + + /* + * then map the fragments + */ + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + + if (__splice_segment(skb_frag_page(f), + skb_frag_off(f), skb_frag_size(f), + offset, len, spd, false, sk, pipe)) + return true; + } + + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + return true; + } + + return false; +} + +/* + * Map data from the skb to a pipe. Should handle both the linear part, + * the fragments, and the frag list. + */ +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, + struct pipe_inode_info *pipe, unsigned int tlen, + unsigned int flags) +{ + struct partial_page partial[MAX_SKB_FRAGS]; + struct page *pages[MAX_SKB_FRAGS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, + .ops = &nosteal_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + int ret = 0; + + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); + + if (spd.nr_pages) + ret = splice_to_pipe(pipe, &spd); + + return ret; +} +EXPORT_SYMBOL_GPL(skb_splice_bits); + +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendmsg(sock, msg, vec, num, size); +} + +static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendpage(sock, page, offset, size, flags); +} + +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size); +typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, + int len, sendmsg_func sendmsg, sendpage_func sendpage) +{ + unsigned int orig_len = len; + struct sk_buff *head = skb; + unsigned short fragidx; + int slen, ret; + +do_frag_list: + + /* Deal with head data */ + while (offset < skb_headlen(skb) && len) { + struct kvec kv; + struct msghdr msg; + + slen = min_t(int, len, skb_headlen(skb) - offset); + kv.iov_base = skb->data + offset; + kv.iov_len = slen; + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + + ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, + sendmsg_unlocked, sk, &msg, &kv, 1, slen); + if (ret <= 0) + goto error; + + offset += ret; + len -= ret; + } + + /* All the data was skb head? */ + if (!len) + goto out; + + /* Make offset relative to start of frags */ + offset -= skb_headlen(skb); + + /* Find where we are in frag list */ + for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; + + if (offset < skb_frag_size(frag)) + break; + + offset -= skb_frag_size(frag); + } + + for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; + + slen = min_t(size_t, len, skb_frag_size(frag) - offset); + + while (slen) { + ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, + sendpage_unlocked, sk, + skb_frag_page(frag), + skb_frag_off(frag) + offset, + slen, MSG_DONTWAIT); + if (ret <= 0) + goto error; + + len -= ret; + offset += ret; + slen -= ret; + } + + offset = 0; + } + + if (len) { + /* Process any frag lists */ + + if (skb == head) { + if (skb_has_frag_list(skb)) { + skb = skb_shinfo(skb)->frag_list; + goto do_frag_list; + } + } else if (skb->next) { + skb = skb->next; + goto do_frag_list; + } + } + +out: + return orig_len - len; + +error: + return orig_len == len ? ret : orig_len - len; +} + +/* Send skb data on a socket. Socket must be locked. */ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, + kernel_sendpage_locked); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked); + +/* Send skb data on a socket. Socket must be unlocked. */ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, + sendpage_unlocked); +} + +/** + * skb_store_bits - store bits from kernel buffer to skb + * @skb: destination buffer + * @offset: offset in destination + * @from: source buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source buffer to the + * destination skb. This function handles all the messy bits of + * traversing fragment lists and such. + */ + +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + if (offset > (int)skb->len - len) + goto fault; + + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_to_linear_data_offset(skb, offset, from, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + if (copy > len) + copy = len; + + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(vaddr + p_off, from + copied, p_len); + kunmap_atomic(vaddr); + } + + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_store_bits(frag_iter, offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_store_bits); + +/* Checksum skb data. */ +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, + __wsum csum, const struct skb_checksum_ops *ops) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Checksum header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, + skb->data + offset, copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; + __wsum csum2; + u8 *vaddr; + + if (copy > len) + copy = len; + + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + csum2 = INDIRECT_CALL_1(ops->update, + csum_partial_ext, + vaddr + p_off, p_len, 0); + kunmap_atomic(vaddr); + csum = INDIRECT_CALL_1(ops->combine, + csum_block_add_ext, csum, + csum2, pos, p_len); + pos += p_len; + } + + if (!(len -= copy)) + return csum; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2; + if (copy > len) + copy = len; + csum2 = __skb_checksum(frag_iter, offset - start, + copy, 0, ops); + csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, + csum, csum2, pos, copy); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + BUG_ON(len); + + return csum; +} +EXPORT_SYMBOL(__skb_checksum); + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + const struct skb_checksum_ops ops = { + .update = csum_partial_ext, + .combine = csum_block_add_ext, + }; + + return __skb_checksum(skb, offset, len, csum, &ops); +} +EXPORT_SYMBOL(skb_checksum); + +/* Both of above in one bottle. */ + +__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, + u8 *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + __wsum csum = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial_copy_nocheck(skb->data + offset, to, + copy); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + __wsum csum2; + u8 *vaddr; + + if (copy > len) + copy = len; + + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + csum2 = csum_partial_copy_nocheck(vaddr + p_off, + to + copied, + p_len); + kunmap_atomic(vaddr); + csum = csum_block_add(csum, csum2, pos); + pos += p_len; + } + + if (!(len -= copy)) + return csum; + offset += copy; + to += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + __wsum csum2; + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(frag_iter, + offset - start, + to, copy); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + BUG_ON(len); + return csum; +} +EXPORT_SYMBOL(skb_copy_and_csum_bits); + +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + +static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, + int offset, int len) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static const struct skb_checksum_ops default_crc32c_ops = { + .update = warn_crc32c_csum_update, + .combine = warn_crc32c_csum_combine, +}; + +const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = + &default_crc32c_ops; +EXPORT_SYMBOL(crc32c_csum_stub); + + /** + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() + * @from: source buffer + * + * Calculates the amount of linear headroom needed in the 'to' skb passed + * into skb_zerocopy(). + */ +unsigned int +skb_zerocopy_headlen(const struct sk_buff *from) +{ + unsigned int hlen = 0; + + if (!from->head_frag || + skb_headlen(from) < L1_CACHE_BYTES || + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { + hlen = skb_headlen(from); + if (!hlen) + hlen = from->len; + } + + if (skb_has_frag_list(from)) + hlen = from->len; + + return hlen; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); + +/** + * skb_zerocopy - Zero copy skb to skb + * @to: destination buffer + * @from: source buffer + * @len: number of bytes to copy from source buffer + * @hlen: size of linear headroom in destination buffer + * + * Copies up to `len` bytes from `from` to `to` by creating references + * to the frags in the source buffer. + * + * The `hlen` as calculated by skb_zerocopy_headlen() specifies the + * headroom in the `to` buffer. + * + * Return value: + * 0: everything is OK + * -ENOMEM: couldn't orphan frags of @from due to lack of memory + * -EFAULT: skb_copy_bits() found some problem with skb geometry + */ +int +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +{ + int i, j = 0; + int plen = 0; /* length of skb->head fragment */ + int ret; + struct page *page; + unsigned int offset; + + BUG_ON(!from->head_frag && !hlen); + + /* dont bother with small payloads */ + if (len <= skb_tailroom(to)) + return skb_copy_bits(from, 0, skb_put(to, len), len); + + if (hlen) { + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); + if (unlikely(ret)) + return ret; + len -= hlen; + } else { + plen = min_t(int, skb_headlen(from), len); + if (plen) { + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + __skb_fill_page_desc(to, 0, page, offset, plen); + get_page(page); + j = 1; + len -= plen; + } + } + + skb_len_add(to, len + plen); + + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { + skb_tx_error(from); + return -ENOMEM; + } + skb_zerocopy_clone(to, from, GFP_ATOMIC); + + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { + int size; + + if (!len) + break; + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; + size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), + len); + skb_frag_size_set(&skb_shinfo(to)->frags[j], size); + len -= size; + skb_frag_ref(to, j); + j++; + } + skb_shinfo(to)->nr_frags = j; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_zerocopy); + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ + __wsum csum; + long csstart; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csstart = skb_checksum_start_offset(skb); + else + csstart = skb_headlen(skb); + + BUG_ON(csstart > skb_headlen(skb)); + + skb_copy_from_linear_data(skb, to, csstart); + + csum = 0; + if (csstart != skb->len) + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, + skb->len - csstart); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + long csstuff = csstart + skb->csum_offset; + + *((__sum16 *)(to + csstuff)) = csum_fold(csum); + } +} +EXPORT_SYMBOL(skb_copy_and_csum_dev); + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} +EXPORT_SYMBOL(skb_dequeue); + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} +EXPORT_SYMBOL(skb_dequeue_tail); + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} +EXPORT_SYMBOL(skb_queue_purge); + +/** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * Return value: the sum of truesizes of all purged skbs. + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +unsigned int skb_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + sum += skb->truesize; + kfree_skb(skb); + } + return sum; +} + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_queue_head); + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_queue_tail); + +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * @list: list to use + * + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls + * + * You must know what list the SKB is on. + */ +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_unlink); + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. + */ +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_after(list, old, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_append); + +static inline void skb_split_inside_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, const int pos) +{ + int i; + + skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), + pos - len); + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb_set_tail_pointer(skb, len); +} + +static inline void skb_split_no_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, int pos) +{ + int i, k = 0; + const int nfrags = skb_shinfo(skb)->nr_frags; + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i = 0; i < nfrags; i++) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + skb_frag_ref(skb, i); + skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); + skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); + skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); + skb_shinfo(skb)->nr_frags++; + } + k++; + } else + skb_shinfo(skb)->nr_frags++; + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. + * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ + int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; + + skb_zcopy_downgrade_managed(skb); + + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; + skb_zerocopy_clone(skb1, skb, 0); + if (len < pos) /* Split line is inside header. */ + skb_split_inside_header(skb, skb1, len, pos); + else /* Second chunk has no header, nothing to copy. */ + skb_split_no_header(skb, skb1, len, pos); +} +EXPORT_SYMBOL(skb_split); + +/* Shifting from/to a cloned skb is a no-go. + * + * Caller cannot keep skb_shinfo related pointers past calling here! + */ +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); +} + +/** + * skb_shift - Shifts paged data partially from skb to another + * @tgt: buffer into which tail data gets added + * @skb: buffer from which the paged data comes from + * @shiftlen: shift up to this many bytes + * + * Attempts to shift up to shiftlen worth of bytes, which may be less than + * the length of the skb, from skb to tgt. Returns number bytes shifted. + * It's up to caller to free skb if everything was shifted. + * + * If @tgt runs out of frags, the whole operation is aborted. + * + * Skb cannot include anything else but paged data while tgt is allowed + * to have non-paged data as well. + * + * TODO: full sized shift could be optimized but that would need + * specialized skb free'er to handle frags without up-to-date nr_frags. + */ +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + skb_frag_t *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + + if (skb_headlen(skb)) + return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + skb_frag_off(fragfrom))) { + merge = -1; + } else { + merge = to - 1; + + todo -= skb_frag_size(fragfrom); + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + /* All previous frag pointers might be stale! */ + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + skb_frag_size_add(fragto, shiftlen); + skb_frag_size_sub(fragfrom, shiftlen); + skb_frag_off_add(fragfrom, shiftlen); + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= skb_frag_size(fragfrom)) { + *fragto = *fragfrom; + todo -= skb_frag_size(fragfrom); + from++; + to++; + + } else { + __skb_frag_ref(fragfrom); + skb_frag_page_copy(fragto, fragfrom); + skb_frag_off_copy(fragto, fragfrom); + skb_frag_size_set(fragto, todo); + + skb_frag_off_add(fragfrom, todo); + skb_frag_size_sub(fragfrom, todo); + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + skb_frag_size_add(fragto, skb_frag_size(fragfrom)); + __skb_frag_unref(fragfrom, skb->pp_recycle); + } + + /* Reposition in the original skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_len_add(skb, -shiftlen); + skb_len_add(tgt, shiftlen); + + return shiftlen; +} + +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; + st->frag_off = 0; +} +EXPORT_SYMBOL(skb_prepare_seq_read); + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at @consumed relative to the + * lower offset specified to skb_prepare_seq_read(). Assigns + * the head of the data block to @data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. @consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note 1: The size of each block of data returned can be arbitrary, + * this limitation is the cost for zerocopy sequential + * reads of potentially non linear data. + * + * Note 2: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) { + if (st->frag_data) { + kunmap_atomic(st->frag_data); + st->frag_data = NULL; + } + return 0; + } + +next_skb: + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; + + if (abs_offset < block_limit && !st->frag_data) { + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); + + *data = (u8 *)st->frag_data + pg_off + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_atomic(st->frag_data); + st->frag_data = NULL; + } + + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } + } + + if (st->frag_data) { + kunmap_atomic(st->frag_data); + st->frag_data = NULL; + } + + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + st->frag_idx = 0; + goto next_skb; + } else if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } + + return 0; +} +EXPORT_SYMBOL(skb_seq_read); + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. + */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_atomic(st->frag_data); +} +EXPORT_SYMBOL(skb_abort_seq_read); + +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config) +{ + unsigned int patlen = config->ops->get_pattern_len(config); + struct ts_state state; + unsigned int ret; + + BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); + + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); + + ret = textsearch_find(config, &state); + return (ret + patlen <= to - from ? ret : UINT_MAX); +} +EXPORT_SYMBOL(skb_find_text); + +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + skb_zcopy_downgrade_managed(skb); + get_page(page); + skb_fill_page_desc_noacc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + +/** + * skb_pull_rcsum - pull skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_pull on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_pull unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +{ + unsigned char *data = skb->data; + + BUG_ON(len > skb->len); + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; +} +EXPORT_SYMBOL_GPL(skb_pull_rcsum); + +static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) +{ + skb_frag_t head_frag; + struct page *page; + + page = virt_to_head_page(frag_skb->head); + __skb_frag_set_page(&head_frag, page); + skb_frag_off_set(&head_frag, frag_skb->data - + (unsigned char *)page_address(page)); + skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); + return head_frag; +} + +struct sk_buff *skb_segment_list(struct sk_buff *skb, + netdev_features_t features, + unsigned int offset) +{ + struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; + unsigned int tnl_hlen = skb_tnl_header_len(skb); + unsigned int delta_truesize = 0; + unsigned int delta_len = 0; + struct sk_buff *tail = NULL; + struct sk_buff *nskb, *tmp; + int len_diff, err; + + skb_push(skb, -skb_network_offset(skb) + offset); + + /* Ensure the head is writeable before touching the shared info */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto err_linearize; + + skb_shinfo(skb)->frag_list = NULL; + + while (list_skb) { + nskb = list_skb; + list_skb = list_skb->next; + + err = 0; + delta_truesize += nskb->truesize; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + + tail = nskb; + + delta_len += nskb->len; + + skb_push(nskb, -skb_network_offset(nskb) + offset); + + skb_release_head_state(nskb); + len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); + __copy_skb_header(nskb, skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + nskb->transport_header += len_diff; + skb_copy_from_linear_data_offset(skb, -tnl_hlen, + nskb->data - tnl_hlen, + offset + tnl_hlen); + + if (skb_needs_linearize(nskb, features) && + __skb_linearize(nskb)) + goto err_linearize; + } + + skb->truesize = skb->truesize - delta_truesize; + skb->data_len = skb->data_len - delta_len; + skb->len = skb->len - delta_len; + + skb_gso_reset(skb); + + skb->prev = tail; + + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto err_linearize; + + skb_get(skb); + + return skb; + +err_linearize: + kfree_skb_list(skb->next); + skb->next = NULL; + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(skb_segment_list); + +/** + * skb_segment - Perform protocol segmentation on skb. + * @head_skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function performs segmentation on the given skb. It returns + * a pointer to the first in a list of new skbs for the segments. + * In case of error it returns ERR_PTR(err). + */ +struct sk_buff *skb_segment(struct sk_buff *head_skb, + netdev_features_t features) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; + unsigned int mss = skb_shinfo(head_skb)->gso_size; + unsigned int doffset = head_skb->data - skb_mac_header(head_skb); + unsigned int offset = doffset; + unsigned int tnl_hlen = skb_tnl_header_len(head_skb); + unsigned int partial_segs = 0; + unsigned int headroom; + unsigned int len = head_skb->len; + struct sk_buff *frag_skb; + skb_frag_t *frag; + __be16 proto; + bool csum, sg; + int err = -ENOMEM; + int i = 0; + int nfrags, pos; + + if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && + mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { + struct sk_buff *check_skb; + + for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { + if (skb_headlen(check_skb) && !check_skb->head_frag) { + /* gso_size is untrusted, and we have a frag_list with + * a linear non head_frag item. + * + * If head_skb's headlen does not fit requested gso_size, + * it means that the frag_list members do NOT terminate + * on exact gso_size boundaries. Hence we cannot perform + * skb_frag_t page sharing. Therefore we must fallback to + * copying the frag_list skbs; we do so by disabling SG. + */ + features &= ~NETIF_F_SG; + break; + } + } + } + + __skb_push(head_skb, doffset); + proto = skb_network_protocol(head_skb, NULL); + if (unlikely(!proto)) + return ERR_PTR(-EINVAL); + + sg = !!(features & NETIF_F_SG); + csum = !!can_checksum_protocol(features, proto); + + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + unsigned int frag_len; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* If we get here then all the required + * GSO features except frag_list are supported. + * Try to split the SKB to multiple GSO SKBs + * with no frag_list. + * Currently we can do that only when the buffers don't + * have a linear part and all the buffers except + * the last are of the same length. + */ + frag_len = list_skb->len; + skb_walk_frags(head_skb, iter) { + if (frag_len != iter->len && iter->next) + goto normal; + if (skb_headlen(iter) && !iter->head_frag) + goto normal; + + len -= iter->len; + } + + if (len != frag_len) + goto normal; + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ + partial_segs = len / mss; + if (partial_segs > 1) + mss *= partial_segs; + else + partial_segs = 0; + } + +normal: + headroom = skb_headroom(head_skb); + pos = skb_headlen(head_skb); + + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + return ERR_PTR(-ENOMEM); + + nfrags = skb_shinfo(head_skb)->nr_frags; + frag = skb_shinfo(head_skb)->frags; + frag_skb = head_skb; + + do { + struct sk_buff *nskb; + skb_frag_t *nskb_frag; + int hsize; + int size; + + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } + + hsize = skb_headlen(head_skb) - offset; + + if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && + (skb_headlen(list_skb) == len || sg)) { + BUG_ON(skb_headlen(list_skb) > len); + + nskb = skb_clone(list_skb, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + pos += skb_headlen(list_skb); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + size = skb_frag_size(frag); + if (pos + size > offset + len) + break; + + i++; + pos += size; + frag++; + } + + list_skb = list_skb->next; + + if (unlikely(pskb_trim(nskb, len))) { + kfree_skb(nskb); + goto err; + } + + hsize = skb_end_offset(nskb); + if (skb_cow_head(nskb, doffset + headroom)) { + kfree_skb(nskb); + goto err; + } + + nskb->truesize += skb_end_offset(nskb) - hsize; + skb_release_head_state(nskb); + __skb_push(nskb, doffset); + } else { + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + + nskb = __alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), + NUMA_NO_NODE); + + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, headroom); + __skb_put(nskb, doffset); + } + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + __copy_skb_header(nskb, head_skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); + skb_reset_mac_len(nskb); + + skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, + nskb->data - tnl_hlen, + doffset + tnl_hlen); + + if (nskb->len == len + doffset) + goto perform_csum_check; + + if (!sg) { + if (!csum) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, + len), + len); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } else { + if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) + goto err; + } + continue; + } + + nskb_frag = skb_shinfo(nskb)->frags; + + skb_copy_from_linear_data_offset(head_skb, offset, + skb_put(nskb, hsize), hsize); + + skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + SKBFL_SHARED_FRAG; + + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; + + while (pos < offset + len) { + if (i >= nfrags) { + if (skb_orphan_frags(list_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, list_skb, + GFP_ATOMIC)) + goto err; + + i = 0; + nfrags = skb_shinfo(list_skb)->nr_frags; + frag = skb_shinfo(list_skb)->frags; + frag_skb = list_skb; + if (!skb_headlen(list_skb)) { + BUG_ON(!nfrags); + } else { + BUG_ON(!list_skb->head_frag); + + /* to make room for head_frag. */ + i--; + frag--; + } + + list_skb = list_skb->next; + } + + if (unlikely(skb_shinfo(nskb)->nr_frags >= + MAX_SKB_FRAGS)) { + net_warn_ratelimited( + "skb_segment: too many frags: %u %u\n", + pos, mss); + err = -EINVAL; + goto err; + } + + *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; + __skb_frag_ref(nskb_frag); + size = skb_frag_size(nskb_frag); + + if (pos < offset) { + skb_frag_off_add(nskb_frag, offset - pos); + skb_frag_size_sub(nskb_frag, offset - pos); + } + + skb_shinfo(nskb)->nr_frags++; + + if (pos + size <= offset + len) { + i++; + frag++; + pos += size; + } else { + skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); + goto skip_fraglist; + } + + nskb_frag++; + } + +skip_fraglist: + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + +perform_csum_check: + if (!csum) { + if (skb_has_shared_frag(nskb) && + __skb_linearize(nskb)) + goto err; + + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_checksum(nskb, doffset, + nskb->len - doffset, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } + } while ((offset += len) < head_skb->len); + + /* Some callers want to get the end of the list. + * Put it in segs->prev to avoid walking the list. + * (see validate_xmit_skb_list() for example) + */ + segs->prev = tail; + + if (partial_segs) { + struct sk_buff *iter; + int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; + + /* Update type to add partial and then remove dodgy if set */ + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; + type &= ~SKB_GSO_DODGY; + + /* Update GSO info and prepare to start updating headers on + * our way back down the stack of protocols. + */ + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); + } + + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } + return segs; + +err: + kfree_skb_list(segs); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(skb_segment); + +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), +#endif +#if IS_ENABLED(CONFIG_MPTCP) + [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), +#endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + +#endif +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext_type_len[TC_SKB_EXT] + +#endif +#if IS_ENABLED(CONFIG_MPTCP) + skb_ext_type_len[SKB_EXT_MPTCP] + +#endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + skb_ext_type_len[SKB_EXT_MCTP] + +#endif + 0; +} + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + offsetof(struct sk_buff, cb), + sizeof_field(struct sk_buff, cb), + NULL); + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + sizeof(struct sk_buff_fclones), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); + skb_extensions_init(); +} + +static int +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, + unsigned int recursion_level) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int elt = 0; + + if (unlikely(recursion_level >= 24)) + return -EMSGSIZE; + + if (copy > 0) { + if (copy > len) + copy = len; + sg_set_buf(sg, skb->data + offset, copy); + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; + + if (copy > len) + copy = len; + sg_set_page(&sg[elt], skb_frag_page(frag), copy, + skb_frag_off(frag) + offset - start); + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end, ret; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; + + if (copy > len) + copy = len; + ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy, recursion_level + 1); + if (unlikely(ret < 0)) + return ret; + elt += ret; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + BUG_ON(len); + return elt; +} + +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. Returns either + * the number of scatterlist items used, or -EMSGSIZE if the contents + * could not fit. + */ +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); + + if (nsg <= 0) + return nsg; + + sg_mark_end(&sg[nsg - 1]); + + return nsg; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); + +/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given + * sglist without mark the sg which contain last skb data as the end. + * So the caller can mannipulate sg list as will when padding new data after + * the first call without calling sg_unmark_end to expend sg list. + * + * Scenario to use skb_to_sgvec_nomark: + * 1. sg_init_table + * 2. skb_to_sgvec_nomark(payload1) + * 3. skb_to_sgvec_nomark(payload2) + * + * This is equivalent to: + * 1. sg_init_table + * 2. skb_to_sgvec(payload1) + * 3. sg_unmark_end + * 4. skb_to_sgvec(payload2) + * + * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * is more preferable. + */ +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, + int offset, int len) +{ + return __skb_to_sgvec(skb, sg, offset, len, 0); +} +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); + + + +/** + * skb_cow_data - Check that a socket buffer's data buffers are writable + * @skb: The socket buffer to check. + * @tailbits: Amount of trailing space to be added + * @trailer: Returned pointer to the skb where the @tailbits space begins + * + * Make sure that the data buffers attached to a socket buffer are + * writable. If they are not, private copies are made of the data buffers + * and the socket buffer is set to use these instead. + * + * If @tailbits is given, make sure that there is space to write @tailbits + * bytes of data beyond current end of socket buffer. @trailer will be + * set to point to the skb in which this space begins. + * + * The number of scatterlist elements required to completely map the + * COW'd and extended socket buffer will be returned. + */ +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + !__pskb_pull_tail(skb, __skb_pagelen(skb))) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_has_frag_list(skb)) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... */ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. */ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_has_frag_list(skb1) || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_has_frag_list(skb1)) { + struct sk_buff *skb2; + + /* Fuck, we are miserable poor guys... */ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + if (skb1->sk) + skb_set_owner_w(skb2, skb1->sk); + + /* Looking around. Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} +EXPORT_SYMBOL_GPL(skb_cow_data); + +static void sock_rmem_free(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + +static void skb_set_err_queue(struct sk_buff *skb) +{ + /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. + * So, it is safe to (mis)use it to mark skbs on the error queue. + */ + skb->pkt_type = PACKET_OUTGOING; + BUILD_BUG_ON(PACKET_OUTGOING == 0); +} + +/* + * Note: We dont mem charge error packets (no sk_forward_alloc changes) + */ +int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned int)READ_ONCE(sk->sk_rcvbuf)) + return -ENOMEM; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_rmem_free; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + skb_set_err_queue(skb); + + /* before exiting rcu section, make sure dst is refcounted */ + skb_dst_force(skb); + + skb_queue_tail(&sk->sk_error_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); + return 0; +} +EXPORT_SYMBOL(sock_queue_err_skb); + +static bool is_icmp_err_skb(const struct sk_buff *skb) +{ + return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); +} + +struct sk_buff *sock_dequeue_err_skb(struct sock *sk) +{ + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *skb_next = NULL; + bool icmp_next = false; + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + skb = __skb_dequeue(q); + if (skb && (skb_next = skb_peek(q))) { + icmp_next = is_icmp_err_skb(skb_next); + if (icmp_next) + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + } + spin_unlock_irqrestore(&q->lock, flags); + + if (is_icmp_err_skb(skb) && !icmp_next) + sk->sk_err = 0; + + if (skb_next) + sk_error_report(sk); + + return skb; +} +EXPORT_SYMBOL(sock_dequeue_err_skb); + +/** + * skb_clone_sk - create clone of skb, and take reference to socket + * @skb: the skb to clone + * + * This function creates a clone of a buffer that holds a reference on + * sk_refcnt. Buffers created via this function are meant to be + * returned using sock_queue_err_skb, or free via kfree_skb. + * + * When passing buffers allocated with this function to sock_queue_err_skb + * it is necessary to wrap the call with sock_hold/sock_put in order to + * prevent the socket from being released prior to being enqueued on + * the sk_error_queue. + */ +struct sk_buff *skb_clone_sk(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sk_buff *clone; + + if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return NULL; + } + + clone->sk = sk; + clone->destructor = sock_efree; + + return clone; +} +EXPORT_SYMBOL(skb_clone_sk); + +static void __skb_complete_tx_timestamp(struct sk_buff *skb, + struct sock *sk, + int tstype, + bool opt_stats) +{ + struct sock_exterr_skb *serr; + int err; + + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; + serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk_is_tcp(sk)) + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); + } + + err = sock_queue_err_skb(sk, skb); + + if (err) + kfree_skb(skb); +} + +static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) +{ + bool ret; + + if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) + return true; + + read_lock_bh(&sk->sk_callback_lock); + ret = sk->sk_socket && sk->sk_socket->file && + file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); + read_unlock_bh(&sk->sk_callback_lock); + return ret; +} + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + + if (!skb_may_tx_timestamp(sk, false)) + goto err; + + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); + sock_put(sk); + return; + } + +err: + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +void __skb_tstamp_tx(struct sk_buff *orig_skb, + const struct sk_buff *ack_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) +{ + struct sk_buff *skb; + bool tsonly, opt_stats = false; + u32 tsflags; + + if (!sk) + return; + + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) + return; + + if (tsonly) { +#ifdef CONFIG_INET + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk_is_tcp(sk)) { + skb = tcp_get_timestamping_opt_stats(sk, orig_skb, + ack_skb); + opt_stats = true; + } else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { + skb = skb_clone(orig_skb, GFP_ATOMIC); + + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { + kfree_skb(skb); + return; + } + } + if (!skb) + return; + + if (tsonly) { + skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & + SKBTX_ANY_TSTAMP; + skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; + } + + if (hwtstamps) + *skb_hwtstamps(skb) = *hwtstamps; + else + __net_timestamp(skb); + + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); +} +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) +{ + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; + int err = 1; + + skb->wifi_acked_valid = 1; + skb->wifi_acked = acked; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + } + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); + +/** + * skb_partial_csum_set - set up and verify partial csum values for packet + * @skb: the skb to set + * @start: the number of bytes after skb->data to start checksumming. + * @off: the offset from start to place the checksum. + * + * For untrusted partially-checksummed packets, we need to make sure the values + * for skb->csum_start and skb->csum_offset are valid so we don't oops. + * + * This function checks and sets those values and skb->ip_summed: if this + * returns false you should drop the packet. + */ +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) +{ + u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); + u32 csum_start = skb_headroom(skb) + (u32)start; + + if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { + net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", + start, off, skb_headroom(skb), skb_headlen(skb)); + return false; + } + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = csum_start; + skb->csum_offset = off; + skb->transport_header = csum_start; + return true; +} +EXPORT_SYMBOL_GPL(skb_partial_csum_set); + +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, + unsigned int max) +{ + if (skb_headlen(skb) >= len) + return 0; + + /* If we need to pullup then pullup to the max, so we + * won't need to do it again. + */ + if (max > skb->len) + max = skb->len; + + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) + return -ENOMEM; + + if (skb_headlen(skb) < len) + return -EPROTO; + + return 0; +} + +#define MAX_TCP_HDR_LEN (15 * 4) + +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, + typeof(IPPROTO_IP) proto, + unsigned int off) +{ + int err; + + switch (proto) { + case IPPROTO_TCP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), + off + MAX_TCP_HDR_LEN); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct tcphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; + + case IPPROTO_UDP: + err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), + off + sizeof(struct udphdr)); + if (!err && !skb_partial_csum_set(skb, off, + offsetof(struct udphdr, + check))) + err = -EPROTO; + return err ? ERR_PTR(err) : &udp_hdr(skb)->check; + } + + return ERR_PTR(-EPROTO); +} + +/* This value should be large enough to cover a tagged ethernet header plus + * maximally sized IP and TCP or UDP headers. + */ +#define MAX_IP_HDR_LEN 128 + +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) +{ + unsigned int off; + bool fragment; + __sum16 *csum; + int err; + + fragment = false; + + err = skb_maybe_pull_tail(skb, + sizeof(struct iphdr), + MAX_IP_HDR_LEN); + if (err < 0) + goto out; + + if (ip_is_fragment(ip_hdr(skb))) + fragment = true; + + off = ip_hdrlen(skb); + + err = -EPROTO; + + if (fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - off, + ip_hdr(skb)->protocol, 0); + err = 0; + +out: + return err; +} + +/* This value should be large enough to cover a tagged ethernet header plus + * an IPv6 header, all options, and a maximal TCP or UDP header. + */ +#define MAX_IPV6_HDR_LEN 256 + +#define OPT_HDR(type, skb, off) \ + (type *)(skb_network_header(skb) + (off)) + +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) +{ + int err; + u8 nexthdr; + unsigned int off; + unsigned int len; + bool fragment; + bool done; + __sum16 *csum; + + fragment = false; + done = false; + + off = sizeof(struct ipv6hdr); + + err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + nexthdr = ipv6_hdr(skb)->nexthdr; + + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + while (off <= len && !done) { + switch (nexthdr) { + case IPPROTO_DSTOPTS: + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: { + struct ipv6_opt_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ipv6_opt_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_optlen(hp); + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct ip_auth_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct ip_auth_hdr, skb, off); + nexthdr = hp->nexthdr; + off += ipv6_authlen(hp); + break; + } + case IPPROTO_FRAGMENT: { + struct frag_hdr *hp; + + err = skb_maybe_pull_tail(skb, + off + + sizeof(struct frag_hdr), + MAX_IPV6_HDR_LEN); + if (err < 0) + goto out; + + hp = OPT_HDR(struct frag_hdr, skb, off); + + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) + fragment = true; + + nexthdr = hp->nexthdr; + off += sizeof(struct frag_hdr); + break; + } + default: + done = true; + break; + } + } + + err = -EPROTO; + + if (!done || fragment) + goto out; + + csum = skb_checksum_setup_ip(skb, nexthdr, off); + if (IS_ERR(csum)) + return PTR_ERR(csum); + + if (recalculate) + *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - off, nexthdr, 0); + err = 0; + +out: + return err; +} + +/** + * skb_checksum_setup - set up partial checksum offset + * @skb: the skb to set up + * @recalculate: if true the pseudo-header checksum will be recalculated + */ +int skb_checksum_setup(struct sk_buff *skb, bool recalculate) +{ + int err; + + switch (skb->protocol) { + case htons(ETH_P_IP): + err = skb_checksum_setup_ipv4(skb, recalculate); + break; + + case htons(ETH_P_IPV6): + err = skb_checksum_setup_ipv6(skb, recalculate); + break; + + default: + err = -EPROTO; + break; + } + + return err; +} +EXPORT_SYMBOL(skb_checksum_setup); + +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) + return NULL; + else if (skb->len == len) + return skb; + + skb_chk = skb_clone(skb, GFP_ATOMIC); + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and free any returned skb if it + * differs from the provided skb. + */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + __sum16 ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + goto err; + + if (!pskb_may_pull(skb_chk, offset)) + goto err; + + skb_pull_rcsum(skb_chk, offset); + ret = skb_chkf(skb_chk); + skb_push_rcsum(skb_chk, offset); + + if (ret) + goto err; + + return skb_chk; + +err: + if (skb_chk && skb_chk != skb) + kfree_skb(skb_chk); + + return NULL; + +} +EXPORT_SYMBOL(skb_checksum_trimmed); + +void __skb_warn_lro_forwarding(const struct sk_buff *skb) +{ + net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", + skb->dev->name); +} +EXPORT_SYMBOL(__skb_warn_lro_forwarding); + +void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) +{ + if (head_stolen) { + skb_release_head_state(skb); + kmem_cache_free(skbuff_head_cache, skb); + } else { + __kfree_skb(skb); + } +} +EXPORT_SYMBOL(kfree_skb_partial); + +/** + * skb_try_coalesce - try to merge skb to prior one + * @to: prior buffer + * @from: buffer to add + * @fragstolen: pointer to boolean + * @delta_truesize: how much more was allocated than was requested + */ +bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, + bool *fragstolen, int *delta_truesize) +{ + struct skb_shared_info *to_shinfo, *from_shinfo; + int i, delta, len = from->len; + + *fragstolen = false; + + if (skb_cloned(to)) + return false; + + /* In general, avoid mixing page_pool and non-page_pool allocated + * pages within the same SKB. Additionally avoid dealing with clones + * with page_pool pages, in case the SKB is using page_pool fragment + * references (PP_FLAG_PAGE_FRAG). Since we only take full page + * references for cloned SKBs at the moment that would result in + * inconsistent reference counts. + * In theory we could take full references if @from is cloned and + * !@to->pp_recycle but its tricky (due to potential race with + * the clone disappearing) and rare, so not worth dealing with. + */ + if (to->pp_recycle != from->pp_recycle || + (from->pp_recycle && skb_cloned(from))) + return false; + + if (len <= skb_tailroom(to)) { + if (len) + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); + *delta_truesize = 0; + return true; + } + + to_shinfo = skb_shinfo(to); + from_shinfo = skb_shinfo(from); + if (to_shinfo->frag_list || from_shinfo->frag_list) + return false; + if (skb_zcopy(to) || skb_zcopy(from)) + return false; + + if (skb_headlen(from) != 0) { + struct page *page; + unsigned int offset; + + if (to_shinfo->nr_frags + + from_shinfo->nr_frags >= MAX_SKB_FRAGS) + return false; + + if (skb_head_is_locked(from)) + return false; + + delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + + page = virt_to_head_page(from->head); + offset = from->data - (unsigned char *)page_address(page); + + skb_fill_page_desc(to, to_shinfo->nr_frags, + page, offset, skb_headlen(from)); + *fragstolen = true; + } else { + if (to_shinfo->nr_frags + + from_shinfo->nr_frags > MAX_SKB_FRAGS) + return false; + + delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); + } + + WARN_ON_ONCE(delta < len); + + memcpy(to_shinfo->frags + to_shinfo->nr_frags, + from_shinfo->frags, + from_shinfo->nr_frags * sizeof(skb_frag_t)); + to_shinfo->nr_frags += from_shinfo->nr_frags; + + if (!skb_cloned(from)) + from_shinfo->nr_frags = 0; + + /* if the skb is not cloned this does nothing + * since we set nr_frags to 0. + */ + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); + + to->truesize += delta; + to->len += len; + to->data_len += len; + + *delta_truesize = delta; + return true; +} +EXPORT_SYMBOL(skb_try_coalesce); + +/** + * skb_scrub_packet - scrub an skb + * + * @skb: buffer to clean + * @xnet: packet is crossing netns + * + * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * into/from a tunnel. Some information have to be cleared during these + * operations. + * skb_scrub_packet can also be used to clean a skb before injecting it in + * another namespace (@xnet == true). We have to clear all information in the + * skb that could impact namespace isolation. + */ +void skb_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + nf_reset_trace(skb); + +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; +#endif + + if (!xnet) + return; + + ipvs_reset(skb); + skb->mark = 0; + skb_clear_tstamp(skb); +} +EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int thlen = 0; + + if (skb->encapsulation) { + thlen = skb_inner_transport_header(skb) - + skb_transport_header(skb); + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + thlen += inner_tcp_hdrlen(skb); + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + thlen = tcp_hdrlen(skb); + } else if (unlikely(skb_is_gso_sctp(skb))) { + thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); + } + /* UFO sets gso_size to the size of the fragmentation + * payload, i.e. the size of the L4 (UDP) header is already + * accounted for. + */ + return thlen + shinfo->gso_size; +} + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS + * + * There are a couple of instances where we have a GSO skb, and we + * want to determine what size it would be after it is segmented. + * + * We might want to check: + * - L3+L4+payload size (e.g. IP forwarding) + * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) + * + * This is a helper to do that correctly considering GSO_BY_FRAGS. + * + * @skb: GSO skb + * + * @seg_len: The segmented length (from skb_gso_*_seglen). In the + * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. + * + * @max_len: The maximum permissible length. + * + * Returns true if the segmented length <= max length. + */ +static inline bool skb_gso_size_check(const struct sk_buff *skb, + unsigned int seg_len, + unsigned int max_len) { + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + + if (shinfo->gso_size != GSO_BY_FRAGS) + return seg_len <= max_len; + + /* Undo this so we can re-use header sizes */ + seg_len -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (seg_len + skb_headlen(iter) > max_len) + return false; + } + + return true; +} + +/** + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? + * + * @skb: GSO skb + * @mtu: MTU to validate against + * + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. + */ +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) +{ + return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); + +/** + * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? + * + * @skb: GSO skb + * @len: length to validate against + * + * skb_gso_validate_mac_len validates if a given skb will fit a wanted + * length once split, including L2, L3 and L4 headers and the payload. + */ +bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) +{ + return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); + +static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) +{ + int mac_len, meta_len; + void *meta; + + if (skb_cow(skb, skb_headroom(skb)) < 0) { + kfree_skb(skb); + return NULL; + } + + mac_len = skb->data - skb_mac_header(skb); + if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { + memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), + mac_len - VLAN_HLEN - ETH_TLEN); + } + + meta_len = skb_metadata_len(skb); + if (meta_len) { + meta = skb_metadata_end(skb) - meta_len; + memmove(meta + VLAN_HLEN, meta, meta_len); + } + + skb->mac_header += VLAN_HLEN; + return skb; +} + +struct sk_buff *skb_vlan_untag(struct sk_buff *skb) +{ + struct vlan_hdr *vhdr; + u16 vlan_tci; + + if (unlikely(skb_vlan_tag_present(skb))) { + /* vlan_tci is already set-up so leave this for another time */ + return skb; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto err_free; + /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) + goto err_free; + + vhdr = (struct vlan_hdr *)skb->data; + vlan_tci = ntohs(vhdr->h_vlan_TCI); + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); + + skb_pull_rcsum(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vhdr); + + skb = skb_reorder_vlan_header(skb); + if (unlikely(!skb)) + goto err_free; + + skb_reset_network_header(skb); + if (!skb_transport_header_was_set(skb)) + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + return skb; + +err_free: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_vlan_untag); + +int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable); + +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +{ + struct vlan_hdr *vhdr; + int offset = skb->data - skb_mac_header(skb); + int err; + + if (WARN_ONCE(offset, + "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *vlan_tci = ntohs(vhdr->h_vlan_TCI); + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + + skb_reset_mac_len(skb); + + return err; +} +EXPORT_SYMBOL(__skb_vlan_pop); + +/* Pop a vlan tag either from hwaccel or from payload. + * Expects skb->data at mac header. + */ +int skb_vlan_pop(struct sk_buff *skb) +{ + u16 vlan_tci; + __be16 vlan_proto; + int err; + + if (likely(skb_vlan_tag_present(skb))) { + __vlan_hwaccel_clear_tag(skb); + } else { + if (unlikely(!eth_type_vlan(skb->protocol))) + return 0; + + err = __skb_vlan_pop(skb, &vlan_tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely(!eth_type_vlan(skb->protocol))) + return 0; + + vlan_proto = skb->protocol; + err = __skb_vlan_pop(skb, &vlan_tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_pop); + +/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). + * Expects skb->data at mac header. + */ +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +{ + if (skb_vlan_tag_present(skb)) { + int offset = skb->data - skb_mac_header(skb); + int err; + + if (WARN_ONCE(offset, + "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + + err = __vlan_insert_tag(skb, skb->vlan_proto, + skb_vlan_tag_get(skb)); + if (err) + return err; + + skb->protocol = skb->vlan_proto; + skb->mac_len += VLAN_HLEN; + + skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + } + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_push); + +/** + * skb_eth_pop() - Drop the Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * + * Drop the Ethernet header of @skb. + * + * Expects that skb->data points to the mac header and that no VLAN tags are + * present. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_pop(struct sk_buff *skb) +{ + if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || + skb_network_offset(skb) < ETH_HLEN) + return -EPROTO; + + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL(skb_eth_pop); + +/** + * skb_eth_push() - Add a new Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * @dst: Destination MAC address of the new header + * @src: Source MAC address of the new header + * + * Prepend @skb with a new Ethernet header. + * + * Expects that skb->data points to the mac header, which must be empty. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src) +{ + struct ethhdr *eth; + int err; + + if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) + return -EPROTO; + + err = skb_cow_head(skb, sizeof(*eth)); + if (err < 0) + return err; + + skb_push(skb, sizeof(*eth)); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + eth = eth_hdr(skb); + ether_addr_copy(eth->h_dest, dst); + ether_addr_copy(eth->h_source, src); + eth->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, eth, sizeof(*eth)); + + return 0; +} +EXPORT_SYMBOL(skb_eth_push); + +/* Update the ethertype of hdr and the skb csum value if required. */ +static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~hdr->h_proto, ethertype }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + hdr->h_proto = ethertype; +} + +/** + * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of + * the packet + * + * @skb: buffer + * @mpls_lse: MPLS label stack entry to push + * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) + * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is + * ethernet + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len, bool ethernet) +{ + struct mpls_shim_hdr *lse; + int err; + + if (unlikely(!eth_p_mpls(mpls_proto))) + return -EINVAL; + + /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -EINVAL; + + err = skb_cow_head(skb, MPLS_HLEN); + if (unlikely(err)) + return err; + + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_protocol(skb, skb->protocol); + } + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + mac_len); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_reset_mac_len(skb); + + lse = mpls_hdr(skb); + lse->label_stack_entry = mpls_lse; + skb_postpush_rcsum(skb, lse, MPLS_HLEN); + + if (ethernet && mac_len >= ETH_HLEN) + skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); + skb->protocol = mpls_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_push); + +/** + * skb_mpls_pop() - pop the outermost MPLS header + * + * @skb: buffer + * @next_proto: ethertype of header after popped MPLS header + * @mac_len: length of the MAC header + * @ethernet: flag to indicate if the packet is ethernet + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, + bool ethernet) +{ + int err; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return 0; + + err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + + if (ethernet && mac_len >= ETH_HLEN) { + struct ethhdr *hdr; + + /* use mpls_hdr() to get ethertype to account for VLANs. */ + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); + skb_mod_eth_type(skb, hdr, next_proto); + } + skb->protocol = next_proto; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_pop); + +/** + * skb_mpls_update_lse() - modify outermost MPLS header and update csum + * + * @skb: buffer + * @mpls_lse: new MPLS label stack entry to update to + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) +{ + int err; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + mpls_hdr(skb)->label_stack_entry = mpls_lse; + + return 0; +} +EXPORT_SYMBOL_GPL(skb_mpls_update_lse); + +/** + * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header + * + * @skb: buffer + * + * Expects skb->data at mac header. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_mpls_dec_ttl(struct sk_buff *skb) +{ + u32 lse; + u8 ttl; + + if (unlikely(!eth_p_mpls(skb->protocol))) + return -EINVAL; + + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + + lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); + ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + if (!--ttl) + return -EINVAL; + + lse &= ~MPLS_LS_TTL_MASK; + lse |= ttl << MPLS_LS_TTL_SHIFT; + + return skb_mpls_update_lse(skb, cpu_to_be32(lse)); +} +EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); + +/** + * alloc_skb_with_frags - allocate skb with page frags + * + * @header_len: size of linear part + * @data_len: needed length in frags + * @max_page_order: max page order desired. + * @errcode: pointer to error code if any + * @gfp_mask: allocation mask + * + * This can be used to allocate a paged skb, given a maximal order for frags. + */ +struct sk_buff *alloc_skb_with_frags(unsigned long header_len, + unsigned long data_len, + int max_page_order, + int *errcode, + gfp_t gfp_mask) +{ + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + unsigned long chunk; + struct sk_buff *skb; + struct page *page; + int i; + + *errcode = -EMSGSIZE; + /* Note this test could be relaxed, if we succeed to allocate + * high order pages... + */ + if (npages > MAX_SKB_FRAGS) + return NULL; + + *errcode = -ENOBUFS; + skb = alloc_skb(header_len, gfp_mask); + if (!skb) + return NULL; + + skb->truesize += npages << PAGE_SHIFT; + + for (i = 0; npages > 0; i++) { + int order = max_page_order; + + while (order) { + if (npages >= 1 << order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (page) + goto fill_page; + /* Do not retry other high order allocations */ + order = 1; + max_page_order = 0; + } + order--; + } + page = alloc_page(gfp_mask); + if (!page) + goto failure; +fill_page: + chunk = min_t(unsigned long, data_len, + PAGE_SIZE << order); + skb_fill_page_desc(skb, i, page, 0, chunk); + data_len -= chunk; + npages -= 1 << order; + } + return skb; + +failure: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(alloc_skb_with_frags); + +/* carve out the first off bytes from skb when off < headlen */ +static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + const int headlen, gfp_t gfp_mask) +{ + int i; + unsigned int size = skb_end_offset(skb); + int new_hlen = headlen - off; + u8 *data; + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + size = SKB_WITH_OVERHEAD(size); + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); + skb->len -= off; + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb); + } else { + /* we can reuse existing recount- all we did was + * relocate values + */ + skb_free_head(skb); + } + + skb->head = data; + skb->data = data; + skb->head_frag = 0; + skb_set_end_offset(skb, size); + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + return 0; +} + +static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); + +/* carve out the first eat bytes from skb's frag_list. May recurse into + * pskb_carve() + */ +static int pskb_carve_frag_list(struct sk_buff *skb, + struct skb_shared_info *shinfo, int eat, + gfp_t gfp_mask) +{ + struct sk_buff *list = shinfo->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) { + pr_err("Not enough bytes to eat. Want %d\n", eat); + return -EFAULT; + } + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_shared(list)) { + clone = skb_clone(list, gfp_mask); + if (!clone) + return -ENOMEM; + insp = list->next; + list = clone; + } else { + /* This may be pulled without problems. */ + insp = list; + } + if (pskb_carve(list, eat, gfp_mask) < 0) { + kfree_skb(clone); + return -ENOMEM; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = shinfo->frag_list) != insp) { + shinfo->frag_list = list->next; + consume_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + shinfo->frag_list = clone; + } + return 0; +} + +/* carve off first len bytes from skb. Split line (off) is in the + * non-linear part of skb + */ +static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + int pos, gfp_t gfp_mask) +{ + int i, k = 0; + unsigned int size = skb_end_offset(skb); + u8 *data; + const int nfrags = skb_shinfo(skb)->nr_frags; + struct skb_shared_info *shinfo; + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + + data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + size = SKB_WITH_OVERHEAD(size); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); + for (i = 0; i < nfrags; i++) { + int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + fsize > off) { + shinfo->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < off) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + skb_frag_off_add(&shinfo->frags[0], off - pos); + skb_frag_size_sub(&shinfo->frags[0], off - pos); + } + skb_frag_ref(skb, i); + k++; + } + pos += fsize; + } + shinfo->nr_frags = k; + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + /* split line is in frag list */ + if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { + /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ + if (skb_has_frag_list(skb)) + kfree_skb_list(skb_shinfo(skb)->frag_list); + kfree(data); + return -ENOMEM; + } + skb_release_data(skb); + + skb->head = data; + skb->head_frag = 0; + skb->data = data; + skb_set_end_offset(skb, size); + skb_reset_tail_pointer(skb); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + skb->len -= off; + skb->data_len = skb->len; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; +} + +/* remove len bytes from the beginning of the skb */ +static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) +{ + int headlen = skb_headlen(skb); + + if (len < headlen) + return pskb_carve_inside_header(skb, len, headlen, gfp); + else + return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); +} + +/* Extract to_copy bytes starting at off from skb, and return this in + * a new skb + */ +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, + int to_copy, gfp_t gfp) +{ + struct sk_buff *clone = skb_clone(skb, gfp); + + if (!clone) + return NULL; + + if (pskb_carve(clone, off, gfp) < 0 || + pskb_trim(clone, to_copy)) { + kfree_skb(clone); + return NULL; + } + return clone; +} +EXPORT_SYMBOL(pskb_extract); + +/** + * skb_condense - try to get rid of fragments/frag_list if possible + * @skb: buffer + * + * Can be used to save memory before skb is added to a busy queue. + * If packet has bytes in frags and enough tail room in skb->head, + * pull all of them, so that we can free the frags right now and adjust + * truesize. + * Notes: + * We do not reallocate skb->head thus can not fail. + * Caller must re-evaluate skb->truesize if needed. + */ +void skb_condense(struct sk_buff *skb) +{ + if (skb->data_len) { + if (skb->data_len > skb->end - skb->tail || + skb_cloned(skb)) + return; + + /* Nice, we can free page frag(s) right now */ + __pskb_pull_tail(skb, skb->data_len); + } + /* At this point, skb->truesize might be over estimated, + * because skb had a fragment, and fragments do not tell + * their truesize. + * When we pulled its content into skb->head, fragment + * was freed, but __pskb_pull_tail() could not possibly + * adjust skb->truesize, not knowing the frag truesize. + */ + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); +} + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +/** + * __skb_ext_alloc - allocate a new skb extensions storage + * + * @flags: See kmalloc(). + * + * Returns the newly allocated pointer. The pointer can later attached to a + * skb via __skb_ext_set(). + * Note: caller must handle the skb_ext as an opaque data. + */ +struct skb_ext *__skb_ext_alloc(gfp_t flags) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif + __skb_ext_put(old); + return new; +} + +/** + * __skb_ext_set - attach the specified extension storage to this skb + * @skb: buffer + * @id: extension id + * @ext: extension storage previously allocated via __skb_ext_alloc() + * + * Existing extensions, if any, are cleared. + * + * Returns the pointer to the extension. + */ +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext) +{ + unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); + + skb_ext_put(skb); + newlen = newoff + skb_ext_type_len[id]; + ext->chunks = newlen; + ext->offset[id] = newoff; + skb->extensions = ext; + skb->active_extensions = 1 << id; + return skb_ext_get_ptr(ext, id); +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old, skb->active_extensions); + if (!new) + return NULL; + + if (__skb_ext_exist(new, id)) + goto set_active; + + newoff = new->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = __skb_ext_alloc(GFP_ATOMIC); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; +set_active: + skb->slow_gro = 1; + skb->extensions = new; + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + +#ifdef CONFIG_MCTP_FLOWS +static void skb_ext_put_mctp(struct mctp_flow *flow) +{ + if (flow->key) + mctp_key_unref(flow->key); +} +#endif + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif +#ifdef CONFIG_MCTP_FLOWS + if (__skb_ext_exist(ext, SKB_EXT_MCTP)) + skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); +#endif + + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ + +/** + * skb_attempt_defer_free - queue skb for remote freeing + * @skb: buffer + * + * Put @skb in a per-cpu list, using the cpu which + * allocated the skb/pages to reduce false sharing + * and memory zone spinlock contention. + */ +void skb_attempt_defer_free(struct sk_buff *skb) +{ + int cpu = skb->alloc_cpu; + struct softnet_data *sd; + unsigned long flags; + unsigned int defer_max; + bool kick; + + if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu) || + cpu == raw_smp_processor_id()) { +nodefer: __kfree_skb(skb); + return; + } + + sd = &per_cpu(softnet_data, cpu); + defer_max = READ_ONCE(sysctl_skb_defer_max); + if (READ_ONCE(sd->defer_count) >= defer_max) + goto nodefer; + + spin_lock_irqsave(&sd->defer_lock, flags); + /* Send an IPI every time queue reaches half capacity. */ + kick = sd->defer_count == (defer_max >> 1); + /* Paired with the READ_ONCE() few lines above */ + WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + + skb->next = sd->defer_list; + /* Paired with READ_ONCE() in skb_defer_free_flush() */ + WRITE_ONCE(sd->defer_list, skb); + spin_unlock_irqrestore(&sd->defer_lock, flags); + + /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU + * if we are unlucky enough (this seems very unlikely). + */ + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) + smp_call_function_single_async(cpu, &sd->defer_csd); +} diff --git a/net/core/skmsg.c b/net/core/skmsg.c new file mode 100644 index 000000000..3818035ea --- /dev/null +++ b/net/core/skmsg.c @@ -0,0 +1,1246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/skmsg.h> +#include <linux/skbuff.h> +#include <linux/scatterlist.h> + +#include <net/sock.h> +#include <net/tcp.h> +#include <net/tls.h> + +static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) +{ + if (msg->sg.end > msg->sg.start && + elem_first_coalesce < msg->sg.end) + return true; + + if (msg->sg.end < msg->sg.start && + (elem_first_coalesce > msg->sg.start || + elem_first_coalesce < msg->sg.end)) + return true; + + return false; +} + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce) +{ + struct page_frag *pfrag = sk_page_frag(sk); + u32 osize = msg->sg.size; + int ret = 0; + + len -= msg->sg.size; + while (len > 0) { + struct scatterlist *sge; + u32 orig_offset; + int use, i; + + if (!sk_page_frag_refill(sk, pfrag)) { + ret = -ENOMEM; + goto msg_trim; + } + + orig_offset = pfrag->offset; + use = min_t(int, len, pfrag->size - orig_offset); + if (!sk_wmem_schedule(sk, use)) { + ret = -ENOMEM; + goto msg_trim; + } + + i = msg->sg.end; + sk_msg_iter_var_prev(i); + sge = &msg->sg.data[i]; + + if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) && + sg_page(sge) == pfrag->page && + sge->offset + sge->length == orig_offset) { + sge->length += use; + } else { + if (sk_msg_full(msg)) { + ret = -ENOSPC; + break; + } + + sge = &msg->sg.data[msg->sg.end]; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + sk_msg_iter_next(msg, end); + } + + sk_mem_charge(sk, use); + msg->sg.size += use; + pfrag->offset += use; + len -= use; + } + + return ret; + +msg_trim: + sk_msg_trim(sk, msg, osize); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_alloc); + +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + struct scatterlist *sgd = NULL; + u32 sge_len, sge_off; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + if (sge_len > len) + sge_len = len; + + if (dst->sg.end) + sgd = sk_msg_elem(dst, dst->sg.end - 1); + + if (sgd && + (sg_page(sge) == sg_page(sgd)) && + (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { + sgd->length += sge_len; + dst->sg.size += sge_len; + } else if (!sk_msg_full(dst)) { + sge_off = sge->offset + off; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + } else { + return -ENOSPC; + } + + off = 0; + len -= sge_len; + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (bytes < sge->length) { + sge->length -= bytes; + sge->offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sge->length); + bytes -= sge->length; + sge->length = 0; + sge->offset = 0; + sk_msg_iter_var_next(i); + } while (bytes && i != msg->sg.end); + msg->sg.start = i; +} +EXPORT_SYMBOL_GPL(sk_msg_return_zero); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes) +{ + int i = msg->sg.start; + + do { + struct scatterlist *sge = &msg->sg.data[i]; + int uncharge = (bytes < sge->length) ? bytes : sge->length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +} +EXPORT_SYMBOL_GPL(sk_msg_return); + +static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + u32 len = sge->length; + + /* When the skb owns the memory we free it from consume_skb path. */ + if (!msg->skb) { + if (charge) + sk_mem_uncharge(sk, len); + put_page(sg_page(sge)); + } + memset(sge, 0, sizeof(*sge)); + return len; +} + +static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i, + bool charge) +{ + struct scatterlist *sge = sk_msg_elem(msg, i); + int freed = 0; + + while (msg->sg.size) { + msg->sg.size -= sge->length; + freed += sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, msg->sg.size); + sge = sk_msg_elem(msg, i); + } + consume_skb(msg->skb); + sk_msg_init(msg); + return freed; +} + +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, false); +} +EXPORT_SYMBOL_GPL(sk_msg_free_nocharge); + +int sk_msg_free(struct sock *sk, struct sk_msg *msg) +{ + return __sk_msg_free(sk, msg, msg->sg.start, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free); + +static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, + u32 bytes, bool charge) +{ + struct scatterlist *sge; + u32 i = msg->sg.start; + + while (bytes) { + sge = sk_msg_elem(msg, i); + if (!sge->length) + break; + if (bytes < sge->length) { + if (charge) + sk_mem_uncharge(sk, bytes); + sge->length -= bytes; + sge->offset += bytes; + msg->sg.size -= bytes; + break; + } + + msg->sg.size -= sge->length; + bytes -= sge->length; + sk_msg_free_elem(sk, msg, i, charge); + sk_msg_iter_var_next(i); + sk_msg_check_to_free(msg, i, bytes); + } + msg->sg.start = i; +} + +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, true); +} +EXPORT_SYMBOL_GPL(sk_msg_free_partial); + +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes) +{ + __sk_msg_free_partial(sk, msg, bytes, false); +} + +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) +{ + int trim = msg->sg.size - len; + u32 i = msg->sg.end; + + if (trim <= 0) { + WARN_ON(trim < 0); + return; + } + + sk_msg_iter_var_prev(i); + msg->sg.size = len; + while (msg->sg.data[i].length && + trim >= msg->sg.data[i].length) { + trim -= msg->sg.data[i].length; + sk_msg_free_elem(sk, msg, i, true); + sk_msg_iter_var_prev(i); + if (!trim) + goto out; + } + + msg->sg.data[i].length -= trim; + sk_mem_uncharge(sk, trim); + /* Adjust copybreak if it falls into the trimmed part of last buf */ + if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) + msg->sg.copybreak = msg->sg.data[i].length; +out: + sk_msg_iter_var_next(i); + msg->sg.end = i; + + /* If we trim data a full sg elem before curr pointer update + * copybreak and current so that any future copy operations + * start at new copy location. + * However trimed data that has not yet been used in a copy op + * does not require an update. + */ + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= + sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } +} +EXPORT_SYMBOL_GPL(sk_msg_trim); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg); + const int to_max_pages = MAX_MSG_FRAGS; + struct page *pages[MAX_MSG_FRAGS]; + ssize_t orig, copied, use, offset; + + orig = msg->sg.size; + while (bytes > 0) { + i = 0; + maxpages = to_max_pages - num_elems; + if (maxpages == 0) { + ret = -EFAULT; + goto out; + } + + copied = iov_iter_get_pages2(from, pages, bytes, maxpages, + &offset); + if (copied <= 0) { + ret = -EFAULT; + goto out; + } + + bytes -= copied; + msg->sg.size += copied; + + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + sg_set_page(&msg->sg.data[msg->sg.end], + pages[i], use, offset); + sg_unmark_end(&msg->sg.data[msg->sg.end]); + sk_mem_charge(sk, use); + + offset = 0; + copied -= use; + sk_msg_iter_next(msg, end); + num_elems++; + i++; + } + /* When zerocopy is mixed with sk_msg_*copy* operations we + * may have a copybreak set in this case clear and prefer + * zerocopy remainder when possible. + */ + msg->sg.copybreak = 0; + msg->sg.curr = msg->sg.end; + } +out: + /* Revert iov_iter updates, msg will need to use 'trim' later if it + * also needs to be cleared. + */ + if (ret) + iov_iter_revert(from, msg->sg.size - orig); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter); + +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes) +{ + int ret = -ENOSPC, i = msg->sg.curr; + struct scatterlist *sge; + u32 copy, buf_size; + void *to; + + do { + sge = sk_msg_elem(msg, i); + /* This is possible if a trim operation shrunk the buffer */ + if (msg->sg.copybreak >= sge->length) { + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + sge = sk_msg_elem(msg, i); + } + + buf_size = sge->length - msg->sg.copybreak; + copy = (buf_size > bytes) ? bytes : buf_size; + to = sg_virt(sge) + msg->sg.copybreak; + msg->sg.copybreak += copy; + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + ret = copy_from_iter_nocache(to, copy, from); + else + ret = copy_from_iter(to, copy, from); + if (ret != copy) { + ret = -EFAULT; + goto out; + } + bytes -= copy; + if (!bytes) + break; + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); +out: + msg->sg.curr = i; + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. + */ + if (copy != sge->length) + goto out; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } +out: + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + +bool sk_msg_is_readable(struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} +EXPORT_SYMBOL_GPL(sk_msg_is_readable); + +static struct sk_msg *alloc_sk_msg(gfp_t gfp) +{ + struct sk_msg *msg; + + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); + if (unlikely(!msg)) + return NULL; + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); + return msg; +} + +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return NULL; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return NULL; + + return alloc_sk_msg(GFP_KERNEL); +} + +static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, + u32 off, u32 len, + struct sk_psock *psock, + struct sock *sk, + struct sk_msg *msg) +{ + int num_sge, copied; + + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); + if (num_sge < 0) { + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. + */ + if (skb_linearize(skb)) + return -EAGAIN; + + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); + if (unlikely(num_sge < 0)) + return num_sge; + } + + copied = len; + msg->sg.start = 0; + msg->sg.size = copied; + msg->sg.end = num_sge; + msg->skb = skb; + + sk_psock_queue_msg(psock, msg); + sk_psock_data_ready(sk, psock); + return copied; +} + +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len); + +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len) +{ + struct sock *sk = psock->sk; + struct sk_msg *msg; + int err; + + /* If we are receiving on the same sock skb->sk is already assigned, + * skip memory accounting and owner transition seeing it already set + * correctly. + */ + if (unlikely(skb->sk == sk)) + return sk_psock_skb_ingress_self(psock, skb, off, len); + msg = sk_psock_create_ingress_msg(sk, skb); + if (!msg) + return -EAGAIN; + + /* This will transition ownership of the data from the socket where + * the BPF program was run initiating the redirect to the socket + * we will eventually receive this data on. The data will be released + * from skb_consume found in __tcp_bpf_recvmsg() after its been copied + * into user buffers. + */ + skb_set_owner_r(skb, sk); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + if (err < 0) + kfree(msg); + return err; +} + +/* Puts an skb on the ingress queue of the socket already assigned to the + * skb. In this case we do not need to check memory limits or skb_set_owner_r + * because the skb is already accounted for here. + */ +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len) +{ + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); + struct sock *sk = psock->sk; + int err; + + if (unlikely(!msg)) + return -EAGAIN; + skb_set_owner_r(skb, sk); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + if (err < 0) + kfree(msg); + return err; +} + +static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len, bool ingress) +{ + int err = 0; + + if (!ingress) { + if (!sock_writeable(psock->sk)) + return -EAGAIN; + return skb_send_sock(psock->sk, skb, off, len); + } + skb_get(skb); + err = sk_psock_skb_ingress(psock, skb, off, len); + if (err < 0) + kfree_skb(skb); + return err; +} + +static void sk_psock_skb_state(struct sk_psock *psock, + struct sk_psock_work_state *state, + int len, int off) +{ + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + state->len = len; + state->off = off; + } + spin_unlock_bh(&psock->ingress_lock); +} + +static void sk_psock_backlog(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); + struct sk_psock_work_state *state = &psock->work_state; + struct sk_buff *skb = NULL; + u32 len = 0, off = 0; + bool ingress; + int ret; + + mutex_lock(&psock->work_mutex); + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + + while ((skb = skb_peek(&psock->ingress_skb))) { + len = skb->len; + off = 0; + if (skb_bpf_strparser(skb)) { + struct strp_msg *stm = strp_msg(skb); + + off = stm->offset; + len = stm->full_len; + } + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); + do { + ret = -EIO; + if (!sock_flag(psock->sk, SOCK_DEAD)) + ret = sk_psock_handle_skb(psock, skb, off, + len, ingress); + if (ret <= 0) { + if (ret == -EAGAIN) { + sk_psock_skb_state(psock, state, len, off); + + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); + goto end; + } + /* Hard errors break pipe and stop xmit. */ + sk_psock_report_error(psock, ret ? -ret : EPIPE); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + goto end; + } + off += ret; + len -= ret; + } while (len); + + skb = skb_dequeue(&psock->ingress_skb); + kfree_skb(skb); + } +end: + mutex_unlock(&psock->work_mutex); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node) +{ + struct sk_psock *psock; + struct proto *prot; + + write_lock_bh(&sk->sk_callback_lock); + + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + + if (sk->sk_user_data) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node); + if (!psock) { + psock = ERR_PTR(-ENOMEM); + goto out; + } + + prot = READ_ONCE(sk->sk_prot); + psock->sk = sk; + psock->eval = __SK_NONE; + psock->sk_proto = prot; + psock->saved_unhash = prot->unhash; + psock->saved_destroy = prot->destroy; + psock->saved_close = prot->close; + psock->saved_write_space = sk->sk_write_space; + + INIT_LIST_HEAD(&psock->link); + spin_lock_init(&psock->link_lock); + + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); + INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); + skb_queue_head_init(&psock->ingress_skb); + + sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); + refcount_set(&psock->refcnt, 1); + + __rcu_assign_sk_user_data_with_flags(sk, psock, + SK_USER_DATA_NOCOPY | + SK_USER_DATA_PSOCK); + sock_hold(sk); + +out: + write_unlock_bh(&sk->sk_callback_lock); + return psock; +} +EXPORT_SYMBOL_GPL(sk_psock_init); + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) +{ + struct sk_psock_link *link; + + spin_lock_bh(&psock->link_lock); + link = list_first_entry_or_null(&psock->link, struct sk_psock_link, + list); + if (link) + list_del(&link->list); + spin_unlock_bh(&psock->link_lock); + return link; +} + +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +{ + struct sk_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { + list_del(&msg->list); + sk_msg_free(psock->sk, msg); + kfree(msg); + } +} + +static void __sk_psock_zap_ingress(struct sk_psock *psock) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + sock_drop(psock->sk, skb); + } + __sk_psock_purge_ingress_msg(psock); +} + +static void sk_psock_link_destroy(struct sk_psock *psock) +{ + struct sk_psock_link *link, *tmp; + + list_for_each_entry_safe(link, tmp, &psock->link, list) { + list_del(&link->list); + sk_psock_free_link(link); + } +} + +void sk_psock_stop(struct sk_psock *psock) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + spin_unlock_bh(&psock->ingress_lock); +} + +static void sk_psock_done_strp(struct sk_psock *psock); + +static void sk_psock_destroy(struct work_struct *work) +{ + struct sk_psock *psock = container_of(to_rcu_work(work), + struct sk_psock, rwork); + /* No sk_callback_lock since already detached. */ + + sk_psock_done_strp(psock); + + cancel_delayed_work_sync(&psock->work); + __sk_psock_zap_ingress(psock); + mutex_destroy(&psock->work_mutex); + + psock_progs_drop(&psock->progs); + + sk_psock_link_destroy(psock); + sk_psock_cork_free(psock); + + if (psock->sk_redir) + sock_put(psock->sk_redir); + if (psock->sk_pair) + sock_put(psock->sk_pair); + sock_put(psock->sk); + kfree(psock); +} + +void sk_psock_drop(struct sock *sk, struct sk_psock *psock) +{ + write_lock_bh(&sk->sk_callback_lock); + sk_psock_restore_proto(sk, psock); + rcu_assign_sk_user_data(sk, NULL); + if (psock->progs.stream_parser) + sk_psock_stop_strp(sk, psock); + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); + write_unlock_bh(&sk->sk_callback_lock); + + sk_psock_stop(psock); + + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_wq, &psock->rwork); +} +EXPORT_SYMBOL_GPL(sk_psock_drop); + +static int sk_psock_map_verd(int verdict, bool redir) +{ + switch (verdict) { + case SK_PASS: + return redir ? __SK_REDIRECT : __SK_PASS; + case SK_DROP: + default: + break; + } + + return __SK_DROP; +} + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg) +{ + struct bpf_prog *prog; + int ret; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.msg_parser); + if (unlikely(!prog)) { + ret = __SK_PASS; + goto out; + } + + sk_msg_compute_data_pointers(msg); + msg->sk = sk; + ret = bpf_prog_run_pin_on_cpu(prog, msg); + ret = sk_psock_map_verd(ret, msg->sk_redir); + psock->apply_bytes = msg->apply_bytes; + if (ret == __SK_REDIRECT) { + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { + ret = __SK_DROP; + goto out; + } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; + sock_hold(psock->sk_redir); + } +out: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); + +static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) +{ + struct sk_psock *psock_other; + struct sock *sk_other; + + sk_other = skb_bpf_redirect_fetch(skb); + /* This error is a buggy BPF program, it returned a redirect + * return code, but then didn't set a redirect interface. + */ + if (unlikely(!sk_other)) { + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; + } + psock_other = sk_psock(sk_other); + /* This error indicates the socket is being torn down or had another + * error that caused the pipe to break. We can't send a packet on + * a socket that is in this state so we drop the skb. + */ + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); + skb_bpf_redirect_clear(skb); + sock_drop(from->sk, skb); + return -EIO; + } + + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_delayed_work(&psock_other->work, 0); + spin_unlock_bh(&psock_other->ingress_lock); + return 0; +} + +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, + struct sk_psock *from, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(from, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.stream_verdict); + if (likely(prog)) { + skb->sk = psock->sk; + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; + } + sk_psock_tls_verdict_apply(skb, psock, ret); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + +static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, + int verdict) +{ + struct sock *sk_other; + int err = 0; + u32 len, off; + + switch (verdict) { + case __SK_PASS: + err = -EIO; + sk_other = psock->sk; + if (sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + goto out_free; + + skb_bpf_set_ingress(skb); + + /* If the queue is empty then we can submit directly + * into the msg queue. If its not empty we have to + * queue work otherwise we may get OOO data. Otherwise, + * if sk_psock_skb_ingress errors will be handled by + * retrying later from workqueue. + */ + if (skb_queue_empty(&psock->ingress_skb)) { + len = skb->len; + off = 0; + if (skb_bpf_strparser(skb)) { + struct strp_msg *stm = strp_msg(skb); + + off = stm->offset; + len = stm->full_len; + } + err = sk_psock_skb_ingress_self(psock, skb, off, len); + } + if (err < 0) { + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_delayed_work(&psock->work, 0); + err = 0; + } + spin_unlock_bh(&psock->ingress_lock); + if (err < 0) + goto out_free; + } + break; + case __SK_REDIRECT: + tcp_eat_skb(psock->sk, skb); + err = sk_psock_skb_redirect(psock, skb); + break; + case __SK_DROP: + default: +out_free: + skb_bpf_redirect_clear(skb); + tcp_eat_skb(psock->sk, skb); + sock_drop(psock->sk, skb); + } + + return err; +} + +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 0); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + struct sock *sk; + + rcu_read_lock(); + sk = strp->sk; + psock = sk_psock(sk); + if (unlikely(!psock)) { + sock_drop(sk, skb); + goto out; + } + prog = READ_ONCE(psock->progs.stream_verdict); + if (likely(prog)) { + skb->sk = sk; + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + skb_bpf_set_strparser(skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); + skb->sk = NULL; + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); +} + +static int sk_psock_strp_read_done(struct strparser *strp, int err) +{ + return err; +} + +static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); + struct bpf_prog *prog; + int ret = skb->len; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.stream_parser); + if (likely(prog)) { + skb->sk = psock->sk; + ret = bpf_prog_run_pin_on_cpu(prog, skb); + skb->sk = NULL; + } + rcu_read_unlock(); + return ret; +} + +/* Called with socket lock held. */ +static void sk_psock_strp_data_ready(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (tls_sw_has_ctx_rx(sk)) { + psock->saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->strp); + write_unlock_bh(&sk->sk_callback_lock); + } + } + rcu_read_unlock(); +} + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + int ret; + + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + return ret; +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + if (psock->saved_data_ready) + return; + + psock->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + psock_set_prog(&psock->progs.stream_parser, NULL); + + if (!psock->saved_data_ready) + return; + + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) + strp_done(&psock->strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) +{ + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + tcp_eat_skb(sk, skb); + sock_drop(sk, skb); + goto out; + } + prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); + } + ret = sk_psock_verdict_apply(psock, skb, ret); + if (ret < 0) + len = ret; +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + int copied; + + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) + return; + copied = sock->ops->read_skb(sk, sk_psock_verdict_recv); + if (copied >= 0) { + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + psock->saved_data_ready(sk); + rcu_read_unlock(); + } +} + +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + if (psock->saved_data_ready) + return; + + psock->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; +} + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + psock_set_prog(&psock->progs.stream_verdict, NULL); + psock_set_prog(&psock->progs.skb_verdict, NULL); + + if (!psock->saved_data_ready) + return; + + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; +} diff --git a/net/core/sock.c b/net/core/sock.c new file mode 100644 index 000000000..c8803b95e --- /dev/null +++ b/net/core/sock.c @@ -0,0 +1,4131 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * Authors: Ross Biro + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Florian La Roche, <flla@stud.uni-sb.de> + * Alan Cox, <A.Cox@swansea.ac.uk> + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. + * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! + * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * Chris Evans : Security fixes - signedness again + * Arnaldo C. Melo : cleanups, use skb_queue_purge + * + * To Fix: + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <asm/unaligned.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/poll.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/user_namespace.h> +#include <linux/static_key.h> +#include <linux/memcontrol.h> +#include <linux/prefetch.h> +#include <linux/compat.h> + +#include <linux/uaccess.h> + +#include <linux/netdevice.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/net_namespace.h> +#include <net/request_sock.h> +#include <net/sock.h> +#include <linux/net_tstamp.h> +#include <net/xfrm.h> +#include <linux/ipsec.h> +#include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h> +#include <linux/sock_diag.h> + +#include <linux/filter.h> +#include <net/sock_reuseport.h> +#include <net/bpf_sk_storage.h> + +#include <trace/events/sock.h> + +#include <net/tcp.h> +#include <net/busy_poll.h> + +#include <linux/ethtool.h> + +#include "dev.h" + +static DEFINE_MUTEX(proto_list_mutex); +static LIST_HEAD(proto_list); + +static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space(struct sock *sk); + +/** + * sk_ns_capable - General socket capability test + * @sk: Socket to use a capability on or through + * @user_ns: The user namespace of the capability to use + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in the user + * namespace @user_ns. + */ +bool sk_ns_capable(const struct sock *sk, + struct user_namespace *user_ns, int cap) +{ + return file_ns_capable(sk->sk_socket->file, user_ns, cap) && + ns_capable(user_ns, cap); +} +EXPORT_SYMBOL(sk_ns_capable); + +/** + * sk_capable - Socket global capability test + * @sk: Socket to use a capability on or through + * @cap: The global capability to use + * + * Test to see if the opener of the socket had when the socket was + * created and the current process has the capability @cap in all user + * namespaces. + */ +bool sk_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, &init_user_ns, cap); +} +EXPORT_SYMBOL(sk_capable); + +/** + * sk_net_capable - Network namespace socket capability test + * @sk: Socket to use a capability on or through + * @cap: The capability to use + * + * Test to see if the opener of the socket had when the socket was created + * and the current process has the capability @cap over the network namespace + * the socket is a member of. + */ +bool sk_net_capable(const struct sock *sk, int cap) +{ + return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); +} +EXPORT_SYMBOL(sk_net_capable); + +/* + * Each address family might have different locking rules, so we have + * one slock key per address family and separate keys for internal and + * userspace sockets. + */ +static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_kern_keys[AF_MAX]; +static struct lock_class_key af_family_slock_keys[AF_MAX]; +static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; + +/* + * Make lock validator output more readable. (we pre-construct these + * strings build-time, so that runtime initialization of socket + * locks is fast): + */ + +#define _sock_locks(x) \ + x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ + x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ + x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ + x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ + x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ + x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ + x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ + x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ + x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ + x "27" , x "28" , x "AF_CAN" , \ + x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ + x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ + x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ + x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MCTP" , \ + x "AF_MAX" + +static const char *const af_family_key_strings[AF_MAX+1] = { + _sock_locks("sk_lock-") +}; +static const char *const af_family_slock_key_strings[AF_MAX+1] = { + _sock_locks("slock-") +}; +static const char *const af_family_clock_key_strings[AF_MAX+1] = { + _sock_locks("clock-") +}; + +static const char *const af_family_kern_key_strings[AF_MAX+1] = { + _sock_locks("k-sk_lock-") +}; +static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { + _sock_locks("k-slock-") +}; +static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { + _sock_locks("k-clock-") +}; +static const char *const af_family_rlock_key_strings[AF_MAX+1] = { + _sock_locks("rlock-") +}; +static const char *const af_family_wlock_key_strings[AF_MAX+1] = { + _sock_locks("wlock-") +}; +static const char *const af_family_elock_key_strings[AF_MAX+1] = { + _sock_locks("elock-") +}; + +/* + * sk_callback_lock and sk queues locking rules are per-address-family, + * so split the lock classes by using a per-AF key: + */ +static struct lock_class_key af_callback_keys[AF_MAX]; +static struct lock_class_key af_rlock_keys[AF_MAX]; +static struct lock_class_key af_wlock_keys[AF_MAX]; +static struct lock_class_key af_elock_keys[AF_MAX]; +static struct lock_class_key af_kern_callback_keys[AF_MAX]; + +/* Run time adjustable parameters. */ +__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +EXPORT_SYMBOL(sysctl_wmem_max); +__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +EXPORT_SYMBOL(sysctl_rmem_max); +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancillary data plus some space */ +int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +EXPORT_SYMBOL(sysctl_optmem_max); + +int sysctl_tstamp_allow_data __read_mostly = 1; + +DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); +EXPORT_SYMBOL_GPL(memalloc_socks_key); + +/** + * sk_set_memalloc - sets %SOCK_MEMALLOC + * @sk: socket to set it on + * + * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. + * It's the responsibility of the admin to adjust min_free_kbytes + * to meet the requirements + */ +void sk_set_memalloc(struct sock *sk) +{ + sock_set_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation |= __GFP_MEMALLOC; + static_branch_inc(&memalloc_socks_key); +} +EXPORT_SYMBOL_GPL(sk_set_memalloc); + +void sk_clear_memalloc(struct sock *sk) +{ + sock_reset_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation &= ~__GFP_MEMALLOC; + static_branch_dec(&memalloc_socks_key); + + /* + * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward + * progress of swapping. SOCK_MEMALLOC may be cleared while + * it has rmem allocations due to the last swapfile being deactivated + * but there is a risk that the socket is unusable due to exceeding + * the rmem limits. Reclaim the reserves and obey rmem limits again. + */ + sk_mem_reclaim(sk); +} +EXPORT_SYMBOL_GPL(sk_clear_memalloc); + +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int ret; + unsigned int noreclaim_flag; + + /* these should have been dropped before queueing */ + BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); + + noreclaim_flag = memalloc_noreclaim_save(); + ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); + memalloc_noreclaim_restore(noreclaim_flag); + + return ret; +} +EXPORT_SYMBOL(__sk_backlog_rcv); + +void sk_error_report(struct sock *sk) +{ + sk->sk_error_report(sk); + + switch (sk->sk_family) { + case AF_INET: + fallthrough; + case AF_INET6: + trace_inet_sk_error_report(sk); + break; + default: + break; + } +} +EXPORT_SYMBOL(sk_error_report); + +int sock_get_timeout(long timeo, void *optval, bool old_timeval) +{ + struct __kernel_sock_timeval tv; + + if (timeo == MAX_SCHEDULE_TIMEOUT) { + tv.tv_sec = 0; + tv.tv_usec = 0; + } else { + tv.tv_sec = timeo / HZ; + tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; + } + + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; + *(struct old_timeval32 *)optval = tv32; + return sizeof(tv32); + } + + if (old_timeval) { + struct __kernel_old_timeval old_tv; + old_tv.tv_sec = tv.tv_sec; + old_tv.tv_usec = tv.tv_usec; + *(struct __kernel_old_timeval *)optval = old_tv; + return sizeof(old_tv); + } + + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); +} +EXPORT_SYMBOL(sock_get_timeout); + +int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, + sockptr_t optval, int optlen, bool old_timeval) +{ + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + struct old_timeval32 tv32; + + if (optlen < sizeof(tv32)) + return -EINVAL; + + if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) + return -EFAULT; + tv->tv_sec = tv32.tv_sec; + tv->tv_usec = tv32.tv_usec; + } else if (old_timeval) { + struct __kernel_old_timeval old_tv; + + if (optlen < sizeof(old_tv)) + return -EINVAL; + if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) + return -EFAULT; + tv->tv_sec = old_tv.tv_sec; + tv->tv_usec = old_tv.tv_usec; + } else { + if (optlen < sizeof(*tv)) + return -EINVAL; + if (copy_from_sockptr(tv, optval, sizeof(*tv))) + return -EFAULT; + } + + return 0; +} +EXPORT_SYMBOL(sock_copy_user_timeval); + +static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, + bool old_timeval) +{ + struct __kernel_sock_timeval tv; + int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + long val; + + if (err) + return err; + + if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) + return -EDOM; + + if (tv.tv_sec < 0) { + static int warned __read_mostly; + + WRITE_ONCE(*timeo_p, 0); + if (warned < 10 && net_ratelimit()) { + warned++; + pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", + __func__, current->comm, task_pid_nr(current)); + } + return 0; + } + val = MAX_SCHEDULE_TIMEOUT; + if ((tv.tv_sec || tv.tv_usec) && + (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) + val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, + USEC_PER_SEC / HZ); + WRITE_ONCE(*timeo_p, val); + return 0; +} + +static bool sock_needs_netstamp(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_UNSPEC: + case AF_UNIX: + return false; + default: + return true; + } +} + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags) +{ + if (sk->sk_flags & flags) { + sk->sk_flags &= ~flags; + if (sock_needs_netstamp(sk) && + !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + net_disable_timestamp(); + } +} + + +int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + atomic_inc(&sk->sk_drops); + trace_sock_rcvqueue_full(sk, skb); + return -ENOMEM; + } + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { + atomic_inc(&sk->sk_drops); + return -ENOBUFS; + } + + skb->dev = NULL; + skb_set_owner_r(skb, sk); + + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + + spin_lock_irqsave(&list->lock, flags); + sock_skb_set_dropcount(sk, skb); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + return 0; +} +EXPORT_SYMBOL(__sock_queue_rcv_skb); + +int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason) +{ + enum skb_drop_reason drop_reason; + int err; + + err = sk_filter(sk, skb); + if (err) { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + goto out; + } + err = __sock_queue_rcv_skb(sk, skb); + switch (err) { + case -ENOMEM: + drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; + break; + case -ENOBUFS: + drop_reason = SKB_DROP_REASON_PROTO_MEM; + break; + default: + drop_reason = SKB_NOT_DROPPED_YET; + break; + } +out: + if (reason) + *reason = drop_reason; + return err; +} +EXPORT_SYMBOL(sock_queue_rcv_skb_reason); + +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested, unsigned int trim_cap, bool refcounted) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter_trim_cap(sk, skb, trim_cap)) + goto discard_and_relse; + + skb->dev = NULL; + + if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + if (nested) + bh_lock_sock_nested(sk); + else + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + /* + * trylock + unlock semantics: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); + + rc = sk_backlog_rcv(sk, skb); + + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); + } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { + bh_unlock_sock(sk); + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + + bh_unlock_sock(sk); +out: + if (refcounted) + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(__sk_receive_skb); + +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, + u32)); +INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, + u32)); +struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { + sk_tx_queue_clear(sk); + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); + RCU_INIT_POINTER(sk->sk_dst_cache, NULL); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(__sk_dst_check); + +struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = sk_dst_get(sk); + + if (dst && dst->obsolete && + INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, + dst, cookie) == NULL) { + sk_dst_reset(sk); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(sk_dst_check); + +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + + /* Sorry... */ + ret = -EPERM; + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) + goto out; + + ret = -EINVAL; + if (ifindex < 0) + goto out; + + /* Paired with all READ_ONCE() done locklessly. */ + WRITE_ONCE(sk->sk_bound_dev_if, ifindex); + + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + sk_dst_reset(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) +{ + int ret; + + if (lock_sk) + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + if (lock_sk) + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + +static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + int index; + + ret = -EINVAL; + if (optlen < 0) + goto out; + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + memset(devname, 0, sizeof(devname)); + + ret = -EFAULT; + if (copy_from_sockptr(devname, optval, optlen)) + goto out; + + index = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); + ret = -ENODEV; + if (!dev) + goto out; + } + + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); +out: +#endif + + return ret; +} + +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + + if (bound_dev_if == 0) { + len = 0; + goto zero; + } + + ret = -EINVAL; + if (len < IFNAMSIZ) + goto out; + + ret = netdev_get_name(net, devname, bound_dev_if); + if (ret) + goto out; + + len = strlen(devname) + 1; + + ret = -EFAULT; + if (copy_to_sockptr(optval, devname, len)) + goto out; + +zero: + ret = -EFAULT; + if (copy_to_sockptr(optlen, &len, sizeof(int))) + goto out; + + ret = 0; + +out: +#endif + + return ret; +} + +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + /* IPV6_ADDRFORM can change sk->sk_family under us. */ + switch (READ_ONCE(sk->sk_family)) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON_ONCE(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + WRITE_ONCE(sk->sk_lingertime, 0); + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + WRITE_ONCE(sk->sk_priority, priority); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); + else + WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + +void sock_set_timestamp(struct sock *sk, int optname, bool valbool) +{ + switch (optname) { + case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; + case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; + case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; + case SO_TIMESTAMPNS_NEW: + __sock_set_timestamps(sk, valbool, true, true); + break; + } +} + +static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) +{ + struct net *net = sock_net(sk); + struct net_device *dev = NULL; + bool match = false; + int *vclock_index; + int i, num; + + if (sk->sk_bound_dev_if) + dev = dev_get_by_index(net, sk->sk_bound_dev_if); + + if (!dev) { + pr_err("%s: sock not bind to device\n", __func__); + return -EOPNOTSUPP; + } + + num = ethtool_get_phc_vclocks(dev, &vclock_index); + dev_put(dev); + + for (i = 0; i < num; i++) { + if (*(vclock_index + i) == phc_index) { + match = true; + break; + } + } + + if (num > 0) + kfree(vclock_index); + + if (!match) + return -EINVAL; + + WRITE_ONCE(sk->sk_bind_phc, phc_index); + + return 0; +} + +int sock_set_timestamping(struct sock *sk, int optname, + struct so_timestamping timestamping) +{ + int val = timestamping.flags; + int ret; + + if (val & ~SOF_TIMESTAMPING_MASK) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk_is_tcp(sk)) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) + return -EINVAL; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); + } else { + atomic_set(&sk->sk_tskey, 0); + } + } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_BIND_PHC) { + ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); + if (ret) + return ret; + } + + WRITE_ONCE(sk->sk_tsflags, val); + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + return 0; +} + +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + +static void __sock_set_mark(struct sock *sk, u32 val) +{ + if (val != sk->sk_mark) { + WRITE_ONCE(sk->sk_mark, val); + sk_dst_reset(sk); + } +} + +void sock_set_mark(struct sock *sk, u32 val) +{ + lock_sock(sk); + __sock_set_mark(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_mark); + +static void sock_release_reserved_memory(struct sock *sk, int bytes) +{ + /* Round down bytes to multiple of pages */ + bytes = round_down(bytes, PAGE_SIZE); + + WARN_ON(bytes > sk->sk_reserved_mem); + WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); + sk_mem_reclaim(sk); +} + +static int sock_reserve_memory(struct sock *sk, int bytes) +{ + long allocated; + bool charged; + int pages; + + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + return -EOPNOTSUPP; + + if (!bytes) + return 0; + + pages = sk_mem_pages(bytes); + + /* pre-charge to memcg */ + charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!charged) + return -ENOMEM; + + /* pre-charge to forward_alloc */ + sk_memory_allocated_add(sk, pages); + allocated = sk_memory_allocated(sk); + /* If the system goes into memory pressure with this + * precharge, give up and return error. + */ + if (allocated > sk_prot_mem_limits(sk, 1)) { + sk_memory_allocated_sub(sk, pages); + mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + return -ENOMEM; + } + sk_forward_alloc_add(sk, pages << PAGE_SHIFT); + + WRITE_ONCE(sk->sk_reserved_mem, + sk->sk_reserved_mem + (pages << PAGE_SHIFT)); + + return 0; +} + +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. + */ + +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; + struct sock_txtime sk_txtime; + int val; + int valbool; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + + if (optname == SO_BINDTODEVICE) + return sock_setbindtodevice(sk, optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + valbool = val ? 1 : 0; + + sockopt_lock_sock(sk); + + switch (optname) { + case SO_DEBUG: + if (val && !sockopt_capable(CAP_NET_ADMIN)) + ret = -EACCES; + else + sock_valbool_flag(sk, SOCK_DBG, valbool); + break; + case SO_REUSEADDR: + sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); + break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + * about it this is right. Otherwise apps have to + * play 'guess the biggest size' games. RCVBUF/SNDBUF + * are treated in BSD as hints + */ + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); +set_sndbuf: + /* Ensure val * 2 fits into an int, to prevent max_t() + * from treating it as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); + /* Wake up sending tasks if we upped the value. */ + sk->sk_write_space(sk); + break; + + case SO_SNDBUFFORCE: + if (!sockopt_capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + if (val < 0) + val = 0; + goto set_sndbuf; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + * about it this is right. Otherwise apps have to + * play 'guess the biggest size' games. RCVBUF/SNDBUF + * are treated in BSD as hints + */ + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); + break; + + case SO_RCVBUFFORCE: + if (!sockopt_capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + __sock_set_rcvbuf(sk, max(val, 0)); + break; + + case SO_KEEPALIVE: + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check_tx = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + WRITE_ONCE(sk->sk_priority, val); + else + ret = -EPERM; + break; + + case SO_LINGER: + if (optlen < sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ + break; + } + if (copy_from_sockptr(&ling, optval, sizeof(ling))) { + ret = -EFAULT; + break; + } + if (!ling.l_onoff) { + sock_reset_flag(sk, SOCK_LINGER); + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); + else + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + sock_set_timestamp(sk, optname, valbool); + break; + + case SO_TIMESTAMPING_NEW: + case SO_TIMESTAMPING_OLD: + if (optlen == sizeof(timestamping)) { + if (copy_from_sockptr(×tamping, optval, + sizeof(timestamping))) { + ret = -EFAULT; + break; + } + } else { + memset(×tamping, 0, sizeof(timestamping)); + timestamping.flags = val; + } + ret = sock_set_timestamping(sk, optname, timestamping); + break; + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + if (sock && sock->ops->set_rcvlowat) + ret = sock->ops->set_rcvlowat(sk, val); + else + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); + break; + + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); + break; + + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); + break; + + case SO_ATTACH_FILTER: { + struct sock_fprog fprog; + + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) + ret = sk_attach_filter(&fprog, sk); + break; + } + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + + case SO_ATTACH_REUSEPORT_CBPF: { + struct sock_fprog fprog; + + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) + ret = sk_reuseport_attach_filter(&fprog, sk); + break; + } + case SO_ATTACH_REUSEPORT_EBPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_reuseport_attach_bpf(ufd, sk); + } + break; + + case SO_DETACH_REUSEPORT_BPF: + ret = reuseport_detach_prog(sk); + break; + + case SO_DETACH_FILTER: + ret = sk_detach_filter(sk); + break; + + case SO_LOCK_FILTER: + if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) + ret = -EPERM; + else + sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); + break; + + case SO_PASSSEC: + if (valbool) + set_bit(SOCK_PASSSEC, &sock->flags); + else + clear_bit(SOCK_PASSSEC, &sock->flags); + break; + case SO_MARK: + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + + __sock_set_mark(sk, val); + break; + case SO_RCVMARK: + sock_valbool_flag(sk, SOCK_RCVMARK, valbool); + break; + + case SO_RXQ_OVFL: + sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); + break; + + case SO_WIFI_STATUS: + sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); + break; + + case SO_PEEK_OFF: + if (sock->ops->set_peek_off) + ret = sock->ops->set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + break; + + case SO_NOFCS: + sock_valbool_flag(sk, SOCK_NOFCS, valbool); + break; + + case SO_SELECT_ERR_QUEUE: + sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); + break; + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + /* allow unprivileged users to decrease the value */ + if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) + ret = -EPERM; + else { + if (val < 0) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_ll_usec, val); + } + break; + case SO_PREFER_BUSY_POLL: + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; +#endif + + case SO_MAX_PACING_RATE: + { + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; + + if (sizeof(ulval) != sizeof(val) && + optlen >= sizeof(ulval) && + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { + ret = -EFAULT; + break; + } + if (ulval != ~0UL) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); + sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); + break; + } + case SO_INCOMING_CPU: + reuseport_update_incoming_cpu(sk, val); + break; + + case SO_CNX_ADVICE: + if (val == 1) + dst_negative_advice(sk); + break; + + case SO_ZEROCOPY: + if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { + if (!(sk_is_tcp(sk) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) + ret = -EOPNOTSUPP; + } else if (sk->sk_family != PF_RDS) { + ret = -EOPNOTSUPP; + } + if (!ret) { + if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + } + break; + + case SO_TXTIME: + if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + break; + } else if (copy_from_sockptr(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + break; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + + case SO_BINDTOIFINDEX: + ret = sock_bindtoindex_locked(sk, val); + break; + + case SO_BUF_LOCK: + if (val & ~SOCK_BUF_LOCK_MASK) { + ret = -EINVAL; + break; + } + sk->sk_userlocks = val | (sk->sk_userlocks & + ~SOCK_BUF_LOCK_MASK); + break; + + case SO_RESERVE_MEM: + { + int delta; + + if (val < 0) { + ret = -EINVAL; + break; + } + + delta = val - sk->sk_reserved_mem; + if (delta < 0) + sock_release_reserved_memory(sk, -delta); + else + ret = sock_reserve_memory(sk, delta); + break; + } + + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + break; + + default: + ret = -ENOPROTOOPT; + break; + } + sockopt_release_sock(sk); + return ret; +} + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} +EXPORT_SYMBOL(sock_setsockopt); + +static const struct cred *sk_get_peer_cred(struct sock *sk) +{ + const struct cred *cred; + + spin_lock(&sk->sk_peer_lock); + cred = get_cred(sk->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + return cred; +} + +static void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) +{ + ucred->pid = pid_vnr(pid); + ucred->uid = ucred->gid = -1; + if (cred) { + struct user_namespace *current_ns = current_user_ns(); + + ucred->uid = from_kuid_munged(current_ns, cred->euid); + ucred->gid = from_kgid_munged(current_ns, cred->egid); + } +} + +static int groups_to_user(sockptr_t dst, const struct group_info *src) +{ + struct user_namespace *user_ns = current_user_ns(); + int i; + + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) + return -EFAULT; + } + + return 0; +} + +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) +{ + struct socket *sock = sk->sk_socket; + + union { + int val; + u64 val64; + unsigned long ulval; + struct linger ling; + struct old_timeval32 tm32; + struct __kernel_old_timeval tm; + struct __kernel_sock_timeval stm; + struct sock_txtime txtime; + struct so_timestamping timestamping; + } v; + + int lv = sizeof(int); + int len; + + if (copy_from_sockptr(&len, optlen, sizeof(int))) + return -EFAULT; + if (len < 0) + return -EINVAL; + + memset(&v, 0, sizeof(v)); + + switch (optname) { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = READ_ONCE(sk->sk_sndbuf); + break; + + case SO_RCVBUF: + v.val = READ_ONCE(sk->sk_rcvbuf); + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_REUSEPORT: + v.val = sk->sk_reuseport; + break; + + case SO_KEEPALIVE: + v.val = sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_PROTOCOL: + v.val = sk->sk_protocol; + break; + + case SO_DOMAIN: + v.val = sk->sk_family; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if (v.val == 0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check_tx; + break; + + case SO_PRIORITY: + v.val = READ_ONCE(sk->sk_priority); + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; + break; + + case SO_BSDCOMPAT: + break; + + case SO_TIMESTAMP_OLD: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_TSTAMP_NEW) && + !sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPNS_OLD: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMP_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPNS_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + lv = sizeof(v.timestamping); + /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only + * returning the flags when they were set through the same option. + * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. + */ + if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + } + break; + + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, + SO_RCVTIMEO_OLD == optname); + break; + + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, + SO_SNDTIMEO_OLD == optname); + break; + + case SO_RCVLOWAT: + v.val = READ_ONCE(sk->sk_rcvlowat); + break; + + case SO_SNDLOWAT: + v.val = 1; + break; + + case SO_PASSCRED: + v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_PEERCRED: + { + struct ucred peercred; + if (len > sizeof(peercred)) + len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); + cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + spin_unlock(&sk->sk_peer_lock); + + if (copy_to_sockptr(optval, &peercred, len)) + return -EFAULT; + goto lenout; + } + + case SO_PEERGROUPS: + { + const struct cred *cred; + int ret, n; + + cred = sk_get_peer_cred(sk); + if (!cred) + return -ENODATA; + + n = cred->group_info->ngroups; + if (len < n * sizeof(gid_t)) { + len = n * sizeof(gid_t); + put_cred(cred); + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; + } + len = n * sizeof(gid_t); + + ret = groups_to_user(optval, cred->group_info); + put_cred(cred); + if (ret) + return ret; + goto lenout; + } + + case SO_PEERNAME: + { + struct sockaddr_storage address; + + lv = sock->ops->getname(sock, (struct sockaddr *)&address, 2); + if (lv < 0) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_sockptr(optval, &address, len)) + return -EFAULT; + goto lenout; + } + + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; + + case SO_PASSSEC: + v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + break; + + case SO_PEERSEC: + return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); + + case SO_MARK: + v.val = READ_ONCE(sk->sk_mark); + break; + + case SO_RCVMARK: + v.val = sock_flag(sk, SOCK_RCVMARK); + break; + + case SO_RXQ_OVFL: + v.val = sock_flag(sk, SOCK_RXQ_OVFL); + break; + + case SO_WIFI_STATUS: + v.val = sock_flag(sk, SOCK_WIFI_STATUS); + break; + + case SO_PEEK_OFF: + if (!sock->ops->set_peek_off) + return -EOPNOTSUPP; + + v.val = READ_ONCE(sk->sk_peek_off); + break; + case SO_NOFCS: + v.val = sock_flag(sk, SOCK_NOFCS); + break; + + case SO_BINDTODEVICE: + return sock_getbindtodevice(sk, optval, optlen, len); + + case SO_GET_FILTER: + len = sk_get_filter(sk, optval, len); + if (len < 0) + return len; + + goto lenout; + + case SO_LOCK_FILTER: + v.val = sock_flag(sk, SOCK_FILTER_LOCKED); + break; + + case SO_BPF_EXTENSIONS: + v.val = bpf_tell_extensions(); + break; + + case SO_SELECT_ERR_QUEUE: + v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); + break; + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + v.val = READ_ONCE(sk->sk_ll_usec); + break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; +#endif + + case SO_MAX_PACING_RATE: + /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ + if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { + lv = sizeof(v.ulval); + v.ulval = READ_ONCE(sk->sk_max_pacing_rate); + } else { + /* 32bit version */ + v.val = min_t(unsigned long, ~0U, + READ_ONCE(sk->sk_max_pacing_rate)); + } + break; + + case SO_INCOMING_CPU: + v.val = READ_ONCE(sk->sk_incoming_cpu); + break; + + case SO_MEMINFO: + { + u32 meminfo[SK_MEMINFO_VARS]; + + sk_get_meminfo(sk, meminfo); + + len = min_t(unsigned int, len, sizeof(meminfo)); + if (copy_to_sockptr(optval, &meminfo, len)) + return -EFAULT; + + goto lenout; + } + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (v.val < MIN_NAPI_ID) + v.val = 0; + + break; +#endif + + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; + + case SO_ZEROCOPY: + v.val = sock_flag(sk, SOCK_ZEROCOPY); + break; + + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; + break; + + case SO_BINDTOIFINDEX: + v.val = READ_ONCE(sk->sk_bound_dev_if); + break; + + case SO_NETNS_COOKIE: + lv = sizeof(u64); + if (len != lv) + return -EINVAL; + v.val64 = sock_net(sk)->net_cookie; + break; + + case SO_BUF_LOCK: + v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; + break; + + case SO_RESERVE_MEM: + v.val = READ_ONCE(sk->sk_reserved_mem); + break; + + case SO_TXREHASH: + /* Paired with WRITE_ONCE() in sk_setsockopt() */ + v.val = READ_ONCE(sk->sk_txrehash); + break; + + default: + /* We implement the SO_SNDLOWAT etc to not be settable + * (1003.1g 7). + */ + return -ENOPROTOOPT; + } + + if (len > lv) + len = lv; + if (copy_to_sockptr(optval, &v, len)) + return -EFAULT; +lenout: + if (copy_to_sockptr(optlen, &len, sizeof(int))) + return -EFAULT; + return 0; +} + +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return sk_getsockopt(sock->sk, level, optname, + USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); +} + +/* + * Initialize an sk_lock. + * + * (We also register the sk_lock with the lock validator.) + */ +static inline void sock_lock_init(struct sock *sk) +{ + if (sk->sk_kern_sock) + sock_lock_init_class_and_name( + sk, + af_family_kern_slock_key_strings[sk->sk_family], + af_family_kern_slock_keys + sk->sk_family, + af_family_kern_key_strings[sk->sk_family], + af_family_kern_keys + sk->sk_family); + else + sock_lock_init_class_and_name( + sk, + af_family_slock_key_strings[sk->sk_family], + af_family_slock_keys + sk->sk_family, + af_family_key_strings[sk->sk_family], + af_family_keys + sk->sk_family); +} + +/* + * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, + * even temporarly, because of RCU lookups. sk_node should also be left as is. + * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end + */ +static void sock_copy(struct sock *nsk, const struct sock *osk) +{ + const struct proto *prot = READ_ONCE(osk->sk_prot); +#ifdef CONFIG_SECURITY_NETWORK + void *sptr = nsk->sk_security; +#endif + + /* If we move sk_tx_queue_mapping out of the private section, + * we must check if sk_tx_queue_clear() is called after + * sock_copy() in sk_clone_lock(). + */ + BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < + offsetof(struct sock, sk_dontcopy_begin) || + offsetof(struct sock, sk_tx_queue_mapping) >= + offsetof(struct sock, sk_dontcopy_end)); + + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); + + memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + +#ifdef CONFIG_SECURITY_NETWORK + nsk->sk_security = sptr; + security_sk_clone(osk, nsk); +#endif +} + +static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, + int family) +{ + struct sock *sk; + struct kmem_cache *slab; + + slab = prot->slab; + if (slab != NULL) { + sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); + if (!sk) + return sk; + if (want_init_on_alloc(priority)) + sk_prot_clear_nulls(sk, prot->obj_size); + } else + sk = kmalloc(prot->obj_size, priority); + + if (sk != NULL) { + if (security_sk_alloc(sk, family, priority)) + goto out_free; + + if (!try_module_get(prot->owner)) + goto out_free_sec; + } + + return sk; + +out_free_sec: + security_sk_free(sk); +out_free: + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + return NULL; +} + +static void sk_prot_free(struct proto *prot, struct sock *sk) +{ + struct kmem_cache *slab; + struct module *owner; + + owner = prot->owner; + slab = prot->slab; + + cgroup_sk_free(&sk->sk_cgrp_data); + mem_cgroup_sk_free(sk); + security_sk_free(sk); + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + module_put(owner); +} + +/** + * sk_alloc - All socket objects are allocated here + * @net: the applicable net namespace + * @family: protocol family + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? + */ +struct sock *sk_alloc(struct net *net, int family, gfp_t priority, + struct proto *prot, int kern) +{ + struct sock *sk; + + sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); + if (sk) { + sk->sk_family = family; + /* + * See comment in struct sock definition to understand + * why we need sk_prot_creator -acme + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sk->sk_kern_sock = kern; + sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) { + get_net_track(net, &sk->ns_tracker, priority); + sock_inuse_add(net, 1); + } + + sock_net_set(sk, net); + refcount_set(&sk->sk_wmem_alloc, 1); + + mem_cgroup_sk_alloc(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); + sk_tx_queue_clear(sk); + } + + return sk; +} +EXPORT_SYMBOL(sk_alloc); + +/* Sockets having SOCK_RCU_FREE will call this function after one RCU + * grace period. This is the case for UDP sockets and TCP listeners. + */ +static void __sk_destruct(struct rcu_head *head) +{ + struct sock *sk = container_of(head, struct sock, sk_rcu); + struct sk_filter *filter; + + if (sk->sk_destruct) + sk->sk_destruct(sk); + + filter = rcu_dereference_check(sk->sk_filter, + refcount_read(&sk->sk_wmem_alloc) == 0); + if (filter) { + sk_filter_uncharge(sk, filter); + RCU_INIT_POINTER(sk->sk_filter, NULL); + } + + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); + +#ifdef CONFIG_BPF_SYSCALL + bpf_sk_storage_free(sk); +#endif + + if (atomic_read(&sk->sk_omem_alloc)) + pr_debug("%s: optmem leakage (%d bytes) detected\n", + __func__, atomic_read(&sk->sk_omem_alloc)); + + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); + + if (likely(sk->sk_net_refcnt)) + put_net_track(sock_net(sk), &sk->ns_tracker); + sk_prot_free(sk->sk_prot_creator, sk); +} + +void sk_destruct(struct sock *sk) +{ + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); + + if (rcu_access_pointer(sk->sk_reuseport_cb)) { + reuseport_detach_sock(sk); + use_call_rcu = true; + } + + if (use_call_rcu) + call_rcu(&sk->sk_rcu, __sk_destruct); + else + __sk_destruct(&sk->sk_rcu); +} + +static void __sk_free(struct sock *sk) +{ + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + + if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + +void sk_free(struct sock *sk) +{ + /* + * We subtract one from sk_wmem_alloc and can know if + * some packets are still in some tx queue. + * If not null, sock_wfree() will call __sk_free(sk) later + */ + if (refcount_dec_and_test(&sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sk_free); + +static void sk_init_common(struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_receive_queue.lock, + af_rlock_keys + sk->sk_family, + af_family_rlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_write_queue.lock, + af_wlock_keys + sk->sk_family, + af_family_wlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_error_queue.lock, + af_elock_keys + sk->sk_family, + af_family_elock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); +} + +/** + * sk_clone_lock - clone a socket, and lock its clone + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +{ + struct proto *prot = READ_ONCE(sk->sk_prot); + struct sk_filter *filter; + bool is_charged = true; + struct sock *newsk; + + newsk = sk_prot_alloc(prot, priority, sk->sk_family); + if (!newsk) + goto out; + + sock_copy(newsk, sk); + + newsk->sk_prot_creator = prot; + + /* SANITY */ + if (likely(newsk->sk_net_refcnt)) { + get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); + sock_inuse_add(sock_net(newsk), 1); + } + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; + + atomic_set(&newsk->sk_rmem_alloc, 0); + + /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ + refcount_set(&newsk->sk_wmem_alloc, 1); + + atomic_set(&newsk->sk_omem_alloc, 0); + sk_init_common(newsk); + + newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_reserved_mem = 0; + atomic_set(&newsk->sk_drops, 0); + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); + + sock_reset_flag(newsk, SOCK_DONE); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + + cgroup_sk_clone(&newsk->sk_cgrp_data); + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter != NULL) + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); + + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. + */ + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); + + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + + /* Clear sk_user_data if parent had the pointer tagged + * as not suitable for copying when cloning. + */ + if (sk_user_data_is_nocopy(newsk)) + newsk->sk_user_data = NULL; + + newsk->sk_err = 0; + newsk->sk_err_soft = 0; + newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); + + /* Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.rst for details) + */ + smp_wmb(); + refcount_set(&newsk->sk_refcnt, 2); + + /* Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); + RCU_INIT_POINTER(newsk->sk_wq, NULL); + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); +out: + return newsk; +} +EXPORT_SYMBOL_GPL(sk_clone_lock); + +void sk_free_unlock_clone(struct sock *sk) +{ + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sk->sk_destruct = NULL; + bh_unlock_sock(sk); + sk_free(sk); +} +EXPORT_SYMBOL_GPL(sk_free_unlock_clone); + +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + +void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + u32 max_segs = 1; + + sk->sk_route_caps = dst->dev->features; + if (sk_is_tcp(sk)) + sk->sk_route_caps |= NETIF_F_GSO; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + if (unlikely(sk->sk_gso_disabled)) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + if (sk_can_gso(sk)) { + if (dst->header_len && !xfrm_dst_offload_ok(dst)) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + } else { + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; + /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ + sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); + sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); + /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ + max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); + } + } + sk->sk_gso_max_segs = max_segs; + sk_dst_set(sk, dst); +} +EXPORT_SYMBOL_GPL(sk_setup_caps); + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + bool free; + + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + if (sock_flag(sk, SOCK_RCU_FREE) && + sk->sk_write_space == sock_def_write_space) { + rcu_read_lock(); + free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); + sock_def_write_space_wfree(sk); + rcu_read_unlock(); + if (unlikely(free)) + __sk_free(sk); + return; + } + + /* + * Keep a reference on sk_wmem_alloc, this will be released + * after sk_write_space() call + */ + WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); + sk->sk_write_space(sk); + len = 1; + } + /* + * if sk_wmem_alloc reaches 0, we must finish what sk_free() + * could not do because of in-flight packets + */ + if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sock_wfree); + +/* This variant of sock_wfree() is used by TCP, + * since it sets SOCK_USE_WRITE_QUEUE. + */ +void __sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) + __sk_free(sk); +} + +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) { + skb->destructor = sock_edemux; + sock_hold(sk); + return; + } +#endif + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ + refcount_add(skb->truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(skb_set_owner_w); + +static bool can_skb_orphan_partial(const struct sk_buff *skb) +{ +#ifdef CONFIG_TLS_DEVICE + /* Drivers depend on in-order delivery for crypto offload, + * partial orphan breaks out-of-order-OK logic. + */ + if (skb->decrypted) + return false; +#endif + return (skb->destructor == sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); +} + +/* This helper is used by netem, as it can hold packets in its + * delay queue. We want to allow the owner socket to send more + * packets, as if they were already TX completed by a typical driver. + * But we also want to keep skb->sk set because some packet schedulers + * rely on it (sch_fq for example). + */ +void skb_orphan_partial(struct sk_buff *skb) +{ + if (skb_is_tcp_pure_ack(skb)) + return; + + if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) + return; + + skb_orphan(skb); +} +EXPORT_SYMBOL(skb_orphan_partial); + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + atomic_sub(len, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, len); +} +EXPORT_SYMBOL(sock_rfree); + +/* + * Buffer destructor for skbs that are not used directly in read or write + * path, e.g. for error handler skbs. Automatically called from kfree_skb. + */ +void sock_efree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_efree); + +/* Buffer destructor for prefetch/receive path where reference count may + * not be held, e.g. for listen sockets. + */ +#ifdef CONFIG_INET +void sock_pfree(struct sk_buff *skb) +{ + if (sk_is_refcounted(skb->sk)) + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ + +kuid_t sock_i_uid(struct sock *sk) +{ + kuid_t uid; + + read_lock_bh(&sk->sk_callback_lock); + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; + read_unlock_bh(&sk->sk_callback_lock); + return uid; +} +EXPORT_SYMBOL(sock_i_uid); + +unsigned long __sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + read_lock(&sk->sk_callback_lock); + ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; + read_unlock(&sk->sk_callback_lock); + return ino; +} +EXPORT_SYMBOL(__sock_i_ino); + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + local_bh_disable(); + ino = __sock_i_ino(sk); + local_bh_enable(); + return ino; +} +EXPORT_SYMBOL(sock_i_ino); + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, + gfp_t priority) +{ + if (force || + refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { + struct sk_buff *skb = alloc_skb(size, priority); + + if (skb) { + skb_set_owner_w(skb, sk); + return skb; + } + } + return NULL; +} +EXPORT_SYMBOL(sock_wmalloc); + +static void sock_ofree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_omem_alloc); +} + +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority) +{ + struct sk_buff *skb; + + /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ + if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > + READ_ONCE(sysctl_optmem_max)) + return NULL; + + skb = alloc_skb(size, priority); + if (!skb) + return NULL; + + atomic_add(skb->truesize, &sk->sk_omem_alloc); + skb->sk = sk; + skb->destructor = sock_ofree; + return skb; +} + +/* + * Allocate a memory block from the socket's option memory buffer. + */ +void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) +{ + int optmem_max = READ_ONCE(sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->sk_omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->sk_omem_alloc); + } + return NULL; +} +EXPORT_SYMBOL(sock_kmalloc); + +/* Free an option memory block. Note, we actually want the inline + * here as this allows gcc to detect the nullify and fold away the + * condition entirely. + */ +static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, + const bool nullify) +{ + if (WARN_ON_ONCE(!mem)) + return; + if (nullify) + kfree_sensitive(mem); + else + kfree(mem); + atomic_sub(size, &sk->sk_omem_alloc); +} + +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, false); +} +EXPORT_SYMBOL(sock_kfree_s); + +void sock_kzfree_s(struct sock *sk, void *mem, int size) +{ + __sock_kfree_s(sk, mem, size, true); +} +EXPORT_SYMBOL(sock_kzfree_s); + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static long sock_wait_for_wmem(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + for (;;) { + if (!timeo) + break; + if (signal_pending(current)) + break; + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) + break; + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) + break; + if (READ_ONCE(sk->sk_err)) + break; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(sk), &wait); + return timeo; +} + + +/* + * Generic send/receive buffer handlers + */ + +struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + unsigned long data_len, int noblock, + int *errcode, int max_page_order) +{ + struct sk_buff *skb; + long timeo; + int err; + + timeo = sock_sndtimeo(sk, noblock); + for (;;) { + err = sock_error(sk); + if (err != 0) + goto failure; + + err = -EPIPE; + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) + goto failure; + + if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) + break; + + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + } + skb = alloc_skb_with_frags(header_len, data_len, max_page_order, + errcode, sk->sk_allocation); + if (skb) + skb_set_owner_w(skb, sk); + return skb; + +interrupted: + err = sock_intr_errno(timeo); +failure: + *errcode = err; + return NULL; +} +EXPORT_SYMBOL(sock_alloc_send_pskb); + +int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc) +{ + u32 tsflags; + + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + + tsflags = *(u32 *)CMSG_DATA(cmsg); + if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) + return -EINVAL; + + sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; + sockc->tsflags |= tsflags; + break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; + /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ + case SCM_RIGHTS: + case SCM_CREDENTIALS: + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(__sock_cmsg_send); + +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + int ret; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + +static void sk_enter_memory_pressure(struct sock *sk) +{ + if (!sk->sk_prot->enter_memory_pressure) + return; + + sk->sk_prot->enter_memory_pressure(sk); +} + +static void sk_leave_memory_pressure(struct sock *sk) +{ + if (sk->sk_prot->leave_memory_pressure) { + INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, + tcp_leave_memory_pressure, sk); + } else { + unsigned long *memory_pressure = sk->sk_prot->memory_pressure; + + if (memory_pressure && READ_ONCE(*memory_pressure)) + WRITE_ONCE(*memory_pressure, 0); + } +} + +DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); + +/** + * skb_page_frag_refill - check that a page_frag contains enough room + * @sz: minimum size of the fragment we want to get + * @pfrag: pointer to page_frag + * @gfp: priority for memory allocation + * + * Note: While this allocator tries to use high order pages, there is + * no guarantee that allocations succeed. Therefore, @sz MUST be + * less or equal than PAGE_SIZE. + */ +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) +{ + if (pfrag->page) { + if (page_ref_count(pfrag->page) == 1) { + pfrag->offset = 0; + return true; + } + if (pfrag->offset + sz <= pfrag->size) + return true; + put_page(pfrag->page); + } + + pfrag->offset = 0; + if (SKB_FRAG_PAGE_ORDER && + !static_branch_unlikely(&net_high_order_alloc_disable_key)) { + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, + SKB_FRAG_PAGE_ORDER); + if (likely(pfrag->page)) { + pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; + return true; + } + } + pfrag->page = alloc_page(gfp); + if (likely(pfrag->page)) { + pfrag->size = PAGE_SIZE; + return true; + } + return false; +} +EXPORT_SYMBOL(skb_page_frag_refill); + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) + return true; + + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; +} +EXPORT_SYMBOL(sk_page_frag_refill); + +void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_bh(&sk->sk_lock.slock); + schedule(); + spin_lock_bh(&sk->sk_lock.slock); + if (!sock_owned_by_user(sk)) + break; + } + finish_wait(&sk->sk_lock.wq, &wait); +} + +void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) +{ + struct sk_buff *skb, *next; + + while ((skb = sk->sk_backlog.head) != NULL) { + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + + spin_unlock_bh(&sk->sk_lock.slock); + + do { + next = skb->next; + prefetch(next); + DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); + skb_mark_not_on_list(skb); + sk_backlog_rcv(sk, skb); + + cond_resched(); + + skb = next; + } while (skb != NULL); + + spin_lock_bh(&sk->sk_lock.slock); + } + + /* + * Doing the zeroing here guarantee we can not loop forever + * while a wild producer attempts to flood us. + */ + sk->sk_backlog.len = 0; +} + +void __sk_flush_backlog(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); + __release_sock(sk); + spin_unlock_bh(&sk->sk_lock.slock); +} +EXPORT_SYMBOL_GPL(__sk_flush_backlog); + +/** + * sk_wait_data - wait for data to arrive at sk_receive_queue + * @sk: sock to wait on + * @timeo: for how long + * @skb: last skb seen on sk_receive_queue + * + * Now socket state including sk->sk_err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ +int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int rc; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} +EXPORT_SYMBOL(sk_wait_data); + +/** + * __sk_mem_raise_allocated - increase memory_allocated + * @sk: socket + * @size: memory size to allocate + * @amt: pages to allocate + * @kind: allocation type + * + * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc + */ +int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) +{ + bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; + struct proto *prot = sk->sk_prot; + bool charged = true; + long allocated; + + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + if (memcg_charge && + !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge()))) + goto suppress_allocation; + + /* Under limit. */ + if (allocated <= sk_prot_mem_limits(sk, 0)) { + sk_leave_memory_pressure(sk); + return 1; + } + + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) + sk_enter_memory_pressure(sk); + + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) + goto suppress_allocation; + + /* guarantee minimum buffer size under pressure */ + if (kind == SK_MEM_RECV) { + if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) + return 1; + + } else { /* SK_MEM_SEND */ + int wmem0 = sk_get_wmem0(sk, prot); + + if (sk->sk_type == SOCK_STREAM) { + if (sk->sk_wmem_queued < wmem0) + return 1; + } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { + return 1; + } + } + + if (sk_has_memory_pressure(sk)) { + u64 alloc; + + if (!sk_under_memory_pressure(sk)) + return 1; + alloc = sk_sockets_allocated_read_positive(sk); + if (sk_prot_mem_limits(sk, 2) > alloc * + sk_mem_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + } + +suppress_allocation: + + if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { + /* Force charge with __GFP_NOFAIL */ + if (memcg_charge && !charged) { + mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge() | __GFP_NOFAIL); + } + return 1; + } + } + + if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); + + sk_memory_allocated_sub(sk, amt); + + if (memcg_charge && charged) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + + return 0; +} + +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + int ret, amt = sk_mem_pages(size); + + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); + ret = __sk_mem_raise_allocated(sk, size, amt, kind); + if (!ret) + sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); + return ret; +} +EXPORT_SYMBOL(__sk_mem_schedule); + +/** + * __sk_mem_reduce_allocated - reclaim memory_allocated + * @sk: socket + * @amount: number of quanta + * + * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc + */ +void __sk_mem_reduce_allocated(struct sock *sk, int amount) +{ + sk_memory_allocated_sub(sk, amount); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + + if (sk_under_global_memory_pressure(sk) && + (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) + sk_leave_memory_pressure(sk); +} + +/** + * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated + * @sk: socket + * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) + */ +void __sk_mem_reclaim(struct sock *sk, int amount) +{ + amount >>= PAGE_SHIFT; + sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); + __sk_mem_reduce_allocated(sk, amount); +} +EXPORT_SYMBOL(__sk_mem_reclaim); + +int sk_set_peek_off(struct sock *sk, int val) +{ + WRITE_ONCE(sk->sk_peek_off, val); + return 0; +} +EXPORT_SYMBOL_GPL(sk_set_peek_off); + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. + */ + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_bind); + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_connect); + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_socketpair); + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_accept); + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int peer) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_getname); + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_ioctl); + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_listen); + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_shutdown); + +int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_sendmsg); + +int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_sendmsg_locked); + +int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, + int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_recvmsg); + +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + /* Mirror missing mmap method error code */ + return -ENODEV; +} +EXPORT_SYMBOL(sock_no_mmap); + +/* + * When a file is received (via SCM_RIGHTS, etc), we must bump the + * various sock-based usage counts. + */ +void __receive_sock(struct file *file) +{ + struct socket *sock; + + sock = sock_from_file(file); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } +} + +ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg(sock, &msg, &iov, 1, size); + kunmap(page); + return res; +} +EXPORT_SYMBOL(sock_no_sendpage); + +ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); + kunmap(page); + return res; +} +EXPORT_SYMBOL(sock_no_sendpage_locked); + +/* + * Default Socket Callbacks + */ + +static void sock_def_wakeup(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static void sock_def_error_report(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, EPOLLERR); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + rcu_read_unlock(); +} + +void sock_def_readable(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | + EPOLLRDNORM | EPOLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +static void sock_def_write_space(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if (sock_writeable(sk)) { + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } + + rcu_read_unlock(); +} + +/* An optimised version of sock_def_write_space(), should only be called + * for SOCK_RCU_FREE sockets under RCU read section and after putting + * ->sk_wmem_alloc. + */ +static void sock_def_write_space_wfree(struct sock *sk) +{ + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if (sock_writeable(sk)) { + struct socket_wq *wq = rcu_dereference(sk->sk_wq); + + /* rely on refcount_sub from sock_wfree() */ + smp_mb__after_atomic(); + if (wq && waitqueue_active(&wq->wait)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } +} + +static void sock_def_destruct(struct sock *sk) +{ +} + +void sk_send_sigurg(struct sock *sk) +{ + if (sk->sk_socket && sk->sk_socket->file) + if (send_sigurg(&sk->sk_socket->file->f_owner)) + sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); +} +EXPORT_SYMBOL(sk_send_sigurg); + +void sk_reset_timer(struct sock *sk, struct timer_list* timer, + unsigned long expires) +{ + if (!mod_timer(timer, expires)) + sock_hold(sk); +} +EXPORT_SYMBOL(sk_reset_timer); + +void sk_stop_timer(struct sock *sk, struct timer_list* timer) +{ + if (del_timer(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer); + +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) +{ + if (del_timer_sync(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer_sync); + +void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) +{ + sk_init_common(sk); + sk->sk_send_head = NULL; + + timer_setup(&sk->sk_timer, NULL, 0); + + sk->sk_allocation = GFP_KERNEL; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); + sk->sk_state = TCP_CLOSE; + sk_set_socket(sk, sock); + + sock_set_flag(sk, SOCK_ZAPPED); + + if (sock) { + sk->sk_type = sock->type; + RCU_INIT_POINTER(sk->sk_wq, &sock->wq); + sock->sk = sk; + } else { + RCU_INIT_POINTER(sk->sk_wq, NULL); + } + sk->sk_uid = uid; + + rwlock_init(&sk->sk_callback_lock); + if (sk->sk_kern_sock) + lockdep_set_class_and_name( + &sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name( + &sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); + + sk->sk_state_change = sock_def_wakeup; + sk->sk_data_ready = sock_def_readable; + sk->sk_write_space = sock_def_write_space; + sk->sk_error_report = sock_def_error_report; + sk->sk_destruct = sock_def_destruct; + + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + sk->sk_peek_off = -1; + + sk->sk_peer_pid = NULL; + sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + + sk->sk_write_pending = 0; + sk->sk_rcvlowat = 1; + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + + sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif + atomic_set(&sk->sk_zckey, 0); + +#ifdef CONFIG_NET_RX_BUSY_POLL + sk->sk_napi_id = 0; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); +#endif + + sk->sk_max_pacing_rate = ~0UL; + sk->sk_pacing_rate = ~0UL; + WRITE_ONCE(sk->sk_pacing_shift, 10); + sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.rst for details) + */ + smp_wmb(); + refcount_set(&sk->sk_refcnt, 1); + atomic_set(&sk->sk_drops, 0); +} +EXPORT_SYMBOL(sock_init_data_uid); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + kuid_t uid = sock ? + SOCK_INODE(sock)->i_uid : + make_kuid(sock_net(sk)->user_ns, 0); + + sock_init_data_uid(sock, sk, uid); +} +EXPORT_SYMBOL(sock_init_data); + +void lock_sock_nested(struct sock *sk, int subclass) +{ + /* The sk_lock has mutex_lock() semantics here. */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + if (sock_owned_by_user_nocheck(sk)) + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock_bh(&sk->sk_lock.slock); +} +EXPORT_SYMBOL(lock_sock_nested); + +void release_sock(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_backlog.tail) + __release_sock(sk); + + /* Warning : release_cb() might need to release sk ownership, + * ie call sock_release_ownership(sk) before us. + */ + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + + sock_release_ownership(sk); + if (waitqueue_active(&sk->sk_lock.wq)) + wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); +} +EXPORT_SYMBOL(release_sock); + +bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + + if (!sock_owned_by_user_nocheck(sk)) { + /* + * Fast path return with bottom halves disabled and + * sock::sk_lock.slock held. + * + * The 'mutex' is not contended and holding + * sock::sk_lock.slock prevents all other lockers to + * proceed so the corresponding unlock_sock_fast() can + * avoid the slow path of release_sock() completely and + * just release slock. + * + * From a semantical POV this is equivalent to 'acquiring' + * the 'mutex', hence the corresponding lockdep + * mutex_release() has to happen in the fast path of + * unlock_sock_fast(). + */ + return false; + } + + __lock_sock(sk); + sk->sk_lock.owned = 1; + __acquire(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); + return true; +} +EXPORT_SYMBOL(__lock_sock_fast); + +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32) +{ + struct sock *sk = sock->sk; + struct timespec64 ts; + + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + ts = ktime_to_timespec64(sock_read_timestamp(sk)); + if (ts.tv_sec == -1) + return -ENOENT; + if (ts.tv_sec == 0) { + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec64(kt); + } + + if (timeval) + ts.tv_nsec /= 1000; + +#ifdef CONFIG_COMPAT_32BIT_TIME + if (time32) + return put_old_timespec32(&ts, userstamp); +#endif +#ifdef CONFIG_SPARC64 + /* beware of padding in sparc64 timeval */ + if (timeval && !in_compat_syscall()) { + struct __kernel_old_timeval __user tv = { + .tv_sec = ts.tv_sec, + .tv_usec = ts.tv_nsec, + }; + if (copy_to_user(userstamp, &tv, sizeof(tv))) + return -EFAULT; + return 0; + } +#endif + return put_timespec64(&ts, userstamp); +} +EXPORT_SYMBOL(sock_gettstamp); + +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) +{ + if (!sock_flag(sk, flag)) { + unsigned long previous_flags = sk->sk_flags; + + sock_set_flag(sk, flag); + /* + * we just set one of the two flags which require net + * time stamping, but time stamping might have been on + * already because of the other one + */ + if (sock_needs_netstamp(sk) && + !(previous_flags & SK_FLAGS_TIMESTAMP)) + net_enable_timestamp(); + } +} + +int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, + int level, int type) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int copied, err; + + err = -EAGAIN; + skb = sock_dequeue_err_skb(sk); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} +EXPORT_SYMBOL(sock_recv_errqueue); + +/* + * Get a socket option on an socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). + */ +int sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(sock_common_getsockopt); + +int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} +EXPORT_SYMBOL(sock_common_recvmsg); + +/* + * Set socket options on an inet socket. + */ +int sock_common_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(sock_common_setsockopt); + +void sk_common_release(struct sock *sk) +{ + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + + /* + * Observation: when sk_common_release is called, processes have + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. + */ + + sk->sk_prot->unhash(sk); + + /* + * In this point socket cannot receive new packets, but it is possible + * that some packets are in flight because some CPU runs receiver and + * did hash table lookup before we unhashed socket. They will achieve + * receive queue and will be purged by socket destructor. + * + * Also we still have packets pending on receive queue and probably, + * our own packets waiting in device queues. sock_destroy will drain + * receive queue, but transmitted packets will delay socket destruction + * until the last reference will be released. + */ + + sock_orphan(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + + sock_put(sk); +} +EXPORT_SYMBOL(sk_common_release); + +void sk_get_meminfo(const struct sock *sk, u32 *mem) +{ + memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); + mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); +} + +#ifdef CONFIG_PROC_FS +static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; + + return res >= 0 ? res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); + +int sock_inuse_get(struct net *net) +{ + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; + + return res; +} + +EXPORT_SYMBOL_GPL(sock_inuse_get); + +static int __net_init sock_inuse_init_net(struct net *net) +{ + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + if (net->core.prot_inuse == NULL) + return -ENOMEM; + return 0; +} + +static void __net_exit sock_inuse_exit_net(struct net *net) +{ + free_percpu(net->core.prot_inuse); +} + +static struct pernet_operations net_inuse_ops = { + .init = sock_inuse_init_net, + .exit = sock_inuse_exit_net, +}; + +static __init int net_inuse_init(void) +{ + if (register_pernet_subsys(&net_inuse_ops)) + panic("Cannot initialize net inuse counters"); + + return 0; +} + +core_initcall(net_inuse_init); + +static int assign_proto_idx(struct proto *prot) +{ + prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); + + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + pr_err("PROTO_INUSE_NR exhausted\n"); + return -ENOSPC; + } + + set_bit(prot->inuse_idx, proto_inuse_idx); + return 0; +} + +static void release_proto_idx(struct proto *prot) +{ + if (prot->inuse_idx != PROTO_INUSE_NR - 1) + clear_bit(prot->inuse_idx, proto_inuse_idx); +} +#else +static inline int assign_proto_idx(struct proto *prot) +{ + return 0; +} + +static inline void release_proto_idx(struct proto *prot) +{ +} + +#endif + +static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) +{ + if (!twsk_prot) + return; + kfree(twsk_prot->twsk_slab_name); + twsk_prot->twsk_slab_name = NULL; + kmem_cache_destroy(twsk_prot->twsk_slab); + twsk_prot->twsk_slab = NULL; +} + +static int tw_prot_init(const struct proto *prot) +{ + struct timewait_sock_ops *twsk_prot = prot->twsk_prot; + + if (!twsk_prot) + return 0; + + twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", + prot->name); + if (!twsk_prot->twsk_slab_name) + return -ENOMEM; + + twsk_prot->twsk_slab = + kmem_cache_create(twsk_prot->twsk_slab_name, + twsk_prot->twsk_obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, + NULL); + if (!twsk_prot->twsk_slab) { + pr_crit("%s: Can't create timewait sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + + return 0; +} + +static void req_prot_cleanup(struct request_sock_ops *rsk_prot) +{ + if (!rsk_prot) + return; + kfree(rsk_prot->slab_name); + rsk_prot->slab_name = NULL; + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; +} + +static int req_prot_init(const struct proto *prot) +{ + struct request_sock_ops *rsk_prot = prot->rsk_prot; + + if (!rsk_prot) + return 0; + + rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", + prot->name); + if (!rsk_prot->slab_name) + return -ENOMEM; + + rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, + rsk_prot->obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, + NULL); + + if (!rsk_prot->slab) { + pr_crit("%s: Can't create request sock SLAB cache!\n", + prot->name); + return -ENOMEM; + } + return 0; +} + +int proto_register(struct proto *prot, int alloc_slab) +{ + int ret = -ENOBUFS; + + if (prot->memory_allocated && !prot->sysctl_mem) { + pr_err("%s: missing sysctl_mem\n", prot->name); + return -EINVAL; + } + if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { + pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); + return -EINVAL; + } + if (alloc_slab) { + prot->slab = kmem_cache_create_usercopy(prot->name, + prot->obj_size, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags, + prot->useroffset, prot->usersize, + NULL); + + if (prot->slab == NULL) { + pr_crit("%s: Can't create sock SLAB cache!\n", + prot->name); + goto out; + } + + if (req_prot_init(prot)) + goto out_free_request_sock_slab; + + if (tw_prot_init(prot)) + goto out_free_timewait_sock_slab; + } + + mutex_lock(&proto_list_mutex); + ret = assign_proto_idx(prot); + if (ret) { + mutex_unlock(&proto_list_mutex); + goto out_free_timewait_sock_slab; + } + list_add(&prot->node, &proto_list); + mutex_unlock(&proto_list_mutex); + return ret; + +out_free_timewait_sock_slab: + if (alloc_slab) + tw_prot_cleanup(prot->twsk_prot); +out_free_request_sock_slab: + if (alloc_slab) { + req_prot_cleanup(prot->rsk_prot); + + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } +out: + return ret; +} +EXPORT_SYMBOL(proto_register); + +void proto_unregister(struct proto *prot) +{ + mutex_lock(&proto_list_mutex); + release_proto_idx(prot); + list_del(&prot->node); + mutex_unlock(&proto_list_mutex); + + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + + req_prot_cleanup(prot->rsk_prot); + tw_prot_cleanup(prot->twsk_prot); +} +EXPORT_SYMBOL(proto_unregister); + +int sock_load_diag_module(int family, int protocol) +{ + if (!protocol) { + if (!sock_is_registered(family)) + return -ENOENT; + + return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family); + } + +#ifdef CONFIG_INET + if (family == AF_INET && + protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && + !rcu_access_pointer(inet_protos[protocol])) + return -ENOENT; +#endif + + return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family, protocol); +} +EXPORT_SYMBOL(sock_load_diag_module); + +#ifdef CONFIG_PROC_FS +static void *proto_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(proto_list_mutex) +{ + mutex_lock(&proto_list_mutex); + return seq_list_start_head(&proto_list, *pos); +} + +static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &proto_list, pos); +} + +static void proto_seq_stop(struct seq_file *seq, void *v) + __releases(proto_list_mutex) +{ + mutex_unlock(&proto_list_mutex); +} + +static char proto_method_implemented(const void *method) +{ + return method == NULL ? 'n' : 'y'; +} +static long sock_prot_memory_allocated(struct proto *proto) +{ + return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; +} + +static const char *sock_prot_memory_pressure(struct proto *proto) +{ + return proto->memory_pressure != NULL ? + proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +} + +static void proto_seq_printf(struct seq_file *seq, struct proto *proto) +{ + + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + proto->name, + proto->obj_size, + sock_prot_inuse_get(seq_file_net(seq), proto), + sock_prot_memory_allocated(proto), + sock_prot_memory_pressure(proto), + proto->max_header, + proto->slab == NULL ? "no" : "yes", + module_name(proto->owner), + proto_method_implemented(proto->close), + proto_method_implemented(proto->connect), + proto_method_implemented(proto->disconnect), + proto_method_implemented(proto->accept), + proto_method_implemented(proto->ioctl), + proto_method_implemented(proto->init), + proto_method_implemented(proto->destroy), + proto_method_implemented(proto->shutdown), + proto_method_implemented(proto->setsockopt), + proto_method_implemented(proto->getsockopt), + proto_method_implemented(proto->sendmsg), + proto_method_implemented(proto->recvmsg), + proto_method_implemented(proto->sendpage), + proto_method_implemented(proto->bind), + proto_method_implemented(proto->backlog_rcv), + proto_method_implemented(proto->hash), + proto_method_implemented(proto->unhash), + proto_method_implemented(proto->get_port), + proto_method_implemented(proto->enter_memory_pressure)); +} + +static int proto_seq_show(struct seq_file *seq, void *v) +{ + if (v == &proto_list) + seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", + "protocol", + "size", + "sockets", + "memory", + "press", + "maxhdr", + "slab", + "module", + "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + else + proto_seq_printf(seq, list_entry(v, struct proto, node)); + return 0; +} + +static const struct seq_operations proto_seq_ops = { + .start = proto_seq_start, + .next = proto_seq_next, + .stop = proto_seq_stop, + .show = proto_seq_show, +}; + +static __net_init int proto_init_net(struct net *net) +{ + if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; + + return 0; +} + +static __net_exit void proto_exit_net(struct net *net) +{ + remove_proc_entry("protocols", net->proc_net); +} + + +static __net_initdata struct pernet_operations proto_net_ops = { + .init = proto_init_net, + .exit = proto_exit_net, +}; + +static int __init proto_init(void) +{ + return register_pernet_subsys(&proto_net_ops); +} + +subsys_initcall(proto_init); + +#endif /* PROC_FS */ + +#ifdef CONFIG_NET_RX_BUSY_POLL +bool sk_busy_loop_end(void *p, unsigned long start_time) +{ + struct sock *sk = p; + + if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + return true; + + if (sk_is_udp(sk) && + !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) + return true; + + return sk_busy_loop_timeout(sk, start_time); +} +EXPORT_SYMBOL(sk_busy_loop_end); +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h new file mode 100644 index 000000000..2f396e6bf --- /dev/null +++ b/net/core/sock_destructor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_SOCK_DESTRUCTOR_H +#define _NET_CORE_SOCK_DESTRUCTOR_H +#include <net/tcp.h> + +static inline bool is_skb_wmem(const struct sk_buff *skb) +{ + return skb->destructor == sock_wfree || + skb->destructor == __sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree); +} +#endif diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c new file mode 100644 index 000000000..f7cf74cdd --- /dev/null +++ b/net/core/sock_diag.c @@ -0,0 +1,340 @@ +/* License: GPL */ + +#include <linux/filter.h> +#include <linux/mutex.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <linux/module.h> +#include <net/sock.h> +#include <linux/kernel.h> +#include <linux/tcp.h> +#include <linux/workqueue.h> +#include <linux/nospec.h> +#include <linux/cookie.h> +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; +static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); +static DEFINE_MUTEX(sock_diag_table_mutex); +static struct workqueue_struct *broadcast_wq; + +DEFINE_COOKIE(sock_cookie); + +u64 __sock_gen_cookie(struct sock *sk) +{ + while (1) { + u64 res = atomic64_read(&sk->sk_cookie); + + if (res) + return res; + res = gen_cookie_next(&sock_cookie); + atomic64_cmpxchg(&sk->sk_cookie, 0, res); + } +} + +int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) +{ + u64 res; + + if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE) + return 0; + + res = sock_gen_cookie(sk); + if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1]) + return -ESTALE; + + return 0; +} +EXPORT_SYMBOL_GPL(sock_diag_check_cookie); + +void sock_diag_save_cookie(struct sock *sk, __u32 *cookie) +{ + u64 res = sock_gen_cookie(sk); + + cookie[0] = (u32)res; + cookie[1] = (u32)(res >> 32); +} +EXPORT_SYMBOL_GPL(sock_diag_save_cookie); + +int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) +{ + u32 mem[SK_MEMINFO_VARS]; + + sk_get_meminfo(sk, mem); + + return nla_put(skb, attrtype, sizeof(mem), &mem); +} +EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); + +int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, + struct sk_buff *skb, int attrtype) +{ + struct sock_fprog_kern *fprog; + struct sk_filter *filter; + struct nlattr *attr; + unsigned int flen; + int err = 0; + + if (!may_report_filterinfo) { + nla_reserve(skb, attrtype, 0); + return 0; + } + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (!filter) + goto out; + + fprog = filter->prog->orig_prog; + if (!fprog) + goto out; + + flen = bpf_classic_proglen(fprog); + + attr = nla_reserve(skb, attrtype, flen); + if (attr == NULL) { + err = -EMSGSIZE; + goto out; + } + + memcpy(nla_data(attr), fprog->filter, flen); +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(sock_diag_put_filterinfo); + +struct broadcast_sk { + struct sock *sk; + struct work_struct work; +}; + +static size_t sock_diag_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ +} + +static void sock_diag_broadcast_destroy_work(struct work_struct *work) +{ + struct broadcast_sk *bsk = + container_of(work, struct broadcast_sk, work); + struct sock *sk = bsk->sk; + const struct sock_diag_handler *hndl; + struct sk_buff *skb; + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + int err = -1; + + WARN_ON(group == SKNLGRP_NONE); + + skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out; + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[sk->sk_family]; + if (hndl && hndl->get_info) + err = hndl->get_info(skb, sk); + mutex_unlock(&sock_diag_table_mutex); + + if (!err) + nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, + GFP_KERNEL); + else + kfree_skb(skb); +out: + sk_destruct(sk); + kfree(bsk); +} + +void sock_diag_broadcast_destroy(struct sock *sk) +{ + /* Note, this function is often called from an interrupt context. */ + struct broadcast_sk *bsk = + kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + if (!bsk) + return sk_destruct(sk); + bsk->sk = sk; + INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); + queue_work(broadcast_wq, &bsk->work); +} + +void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ + mutex_lock(&sock_diag_table_mutex); + inet_rcv_compat = fn; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); + +void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ + mutex_lock(&sock_diag_table_mutex); + inet_rcv_compat = NULL; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); + +int sock_diag_register(const struct sock_diag_handler *hndl) +{ + int err = 0; + + if (hndl->family >= AF_MAX) + return -EINVAL; + + mutex_lock(&sock_diag_table_mutex); + if (sock_diag_handlers[hndl->family]) + err = -EBUSY; + else + sock_diag_handlers[hndl->family] = hndl; + mutex_unlock(&sock_diag_table_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(sock_diag_register); + +void sock_diag_unregister(const struct sock_diag_handler *hnld) +{ + int family = hnld->family; + + if (family >= AF_MAX) + return; + + mutex_lock(&sock_diag_table_mutex); + BUG_ON(sock_diag_handlers[family] != hnld); + sock_diag_handlers[family] = NULL; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister); + +static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int err; + struct sock_diag_req *req = nlmsg_data(nlh); + const struct sock_diag_handler *hndl; + + if (nlmsg_len(nlh) < sizeof(*req)) + return -EINVAL; + + if (req->sdiag_family >= AF_MAX) + return -EINVAL; + req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); + + if (sock_diag_handlers[req->sdiag_family] == NULL) + sock_load_diag_module(req->sdiag_family, 0); + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[req->sdiag_family]; + if (hndl == NULL) + err = -ENOENT; + else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) + err = hndl->dump(skb, nlh); + else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) + err = hndl->destroy(skb, nlh); + else + err = -EOPNOTSUPP; + mutex_unlock(&sock_diag_table_mutex); + + return err; +} + +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + int ret; + + switch (nlh->nlmsg_type) { + case TCPDIAG_GETSOCK: + case DCCPDIAG_GETSOCK: + if (inet_rcv_compat == NULL) + sock_load_diag_module(AF_INET, 0); + + mutex_lock(&sock_diag_table_mutex); + if (inet_rcv_compat != NULL) + ret = inet_rcv_compat(skb, nlh); + else + ret = -EOPNOTSUPP; + mutex_unlock(&sock_diag_table_mutex); + + return ret; + case SOCK_DIAG_BY_FAMILY: + case SOCK_DESTROY: + return __sock_diag_cmd(skb, nlh); + default: + return -EINVAL; + } +} + +static DEFINE_MUTEX(sock_diag_mutex); + +static void sock_diag_rcv(struct sk_buff *skb) +{ + mutex_lock(&sock_diag_mutex); + netlink_rcv_skb(skb, &sock_diag_rcv_msg); + mutex_unlock(&sock_diag_mutex); +} + +static int sock_diag_bind(struct net *net, int group) +{ + switch (group) { + case SKNLGRP_INET_TCP_DESTROY: + case SKNLGRP_INET_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET]) + sock_load_diag_module(AF_INET, 0); + break; + case SKNLGRP_INET6_TCP_DESTROY: + case SKNLGRP_INET6_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET6]) + sock_load_diag_module(AF_INET6, 0); + break; + } + return 0; +} + +int sock_diag_destroy(struct sock *sk, int err) +{ + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!sk->sk_prot->diag_destroy) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, err); +} +EXPORT_SYMBOL_GPL(sock_diag_destroy); + +static int __net_init diag_net_init(struct net *net) +{ + struct netlink_kernel_cfg cfg = { + .groups = SKNLGRP_MAX, + .input = sock_diag_rcv, + .bind = sock_diag_bind, + .flags = NL_CFG_F_NONROOT_RECV, + }; + + net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); + return net->diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __net_exit diag_net_exit(struct net *net) +{ + netlink_kernel_release(net->diag_nlsk); + net->diag_nlsk = NULL; +} + +static struct pernet_operations diag_net_ops = { + .init = diag_net_init, + .exit = diag_net_exit, +}; + +static int __init sock_diag_init(void) +{ + broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + BUG_ON(!broadcast_wq); + return register_pernet_subsys(&diag_net_ops); +} +device_initcall(sock_diag_init); diff --git a/net/core/sock_map.c b/net/core/sock_map.c new file mode 100644 index 000000000..91140bc05 --- /dev/null +++ b/net/core/sock_map.c @@ -0,0 +1,1701 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/filter.h> +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/net.h> +#include <linux/workqueue.h> +#include <linux/skmsg.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/sock_diag.h> +#include <net/udp.h> + +struct bpf_stab { + struct bpf_map map; + struct sock **sks; + struct sk_psock_progs progs; + raw_spinlock_t lock; +}; + +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size != 4 || + (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); + if (!stab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); + + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * + sizeof(struct sock *), + stab->map.numa_node); + if (!stab->sks) { + bpf_map_area_free(stab); + return ERR_PTR(-ENOMEM); + } + + return &stab->map; +} + +int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) +{ + u32 ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + fdput(f); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + u32 ufd = attr->target_fd; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto put_map; + } + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +put_prog: + bpf_prog_put(prog); +put_map: + fdput(f); + return ret; +} + +static void sock_map_sk_acquire(struct sock *sk) + __acquires(&sk->sk_lock.slock) +{ + lock_sock(sk); + rcu_read_lock(); +} + +static void sock_map_sk_release(struct sock *sk) + __releases(&sk->sk_lock.slock) +{ + rcu_read_unlock(); + release_sock(sk); +} + +static void sock_map_add_link(struct sk_psock *psock, + struct sk_psock_link *link, + struct bpf_map *map, void *link_raw) +{ + link->link_raw = link_raw; + link->map = map; + spin_lock_bh(&psock->link_lock); + list_add_tail(&link->list, &psock->link); + spin_unlock_bh(&psock->link_lock); +} + +static void sock_map_del_link(struct sock *sk, + struct sk_psock *psock, void *link_raw) +{ + bool strp_stop = false, verdict_stop = false; + struct sk_psock_link *link, *tmp; + + spin_lock_bh(&psock->link_lock); + list_for_each_entry_safe(link, tmp, &psock->link, list) { + if (link->link_raw == link_raw) { + struct bpf_map *map = link->map; + struct sk_psock_progs *progs = sock_map_progs(map); + + if (psock->saved_data_ready && progs->stream_parser) + strp_stop = true; + if (psock->saved_data_ready && progs->stream_verdict) + verdict_stop = true; + if (psock->saved_data_ready && progs->skb_verdict) + verdict_stop = true; + list_del(&link->list); + sk_psock_free_link(link); + } + } + spin_unlock_bh(&psock->link_lock); + if (strp_stop || verdict_stop) { + write_lock_bh(&sk->sk_callback_lock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + if (verdict_stop) + sk_psock_stop_verdict(sk, psock); + + if (psock->psock_update_sk_prot) + psock->psock_update_sk_prot(sk, psock, false); + write_unlock_bh(&sk->sk_callback_lock); + } +} + +static void sock_map_unref(struct sock *sk, void *link_raw) +{ + struct sk_psock *psock = sk_psock(sk); + + if (likely(psock)) { + sock_map_del_link(sk, psock, link_raw); + sk_psock_put(sk, psock); + } +} + +static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) +{ + if (!sk->sk_prot->psock_update_sk_prot) + return -EINVAL; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); +} + +static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) { + if (sk->sk_prot->close != sock_map_close) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + if (!refcount_inc_not_zero(&psock->refcnt)) + psock = ERR_PTR(-EBUSY); + } +out: + rcu_read_unlock(); + return psock; +} + +static int sock_map_link(struct bpf_map *map, struct sock *sk) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog *stream_verdict = NULL; + struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; + struct bpf_prog *msg_parser = NULL; + struct sk_psock *psock; + int ret; + + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); + } + + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; + } + } + + msg_parser = READ_ONCE(progs->msg_parser); + if (msg_parser) { + msg_parser = bpf_prog_inc_not_zero(msg_parser); + if (IS_ERR(msg_parser)) { + ret = PTR_ERR(msg_parser); + goto out_put_stream_parser; + } + } + + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + + psock = sock_map_psock_get_checked(sk); + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); + goto out_progs; + } + + if (psock) { + if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { + sk_psock_put(sk, psock); + ret = -EBUSY; + goto out_progs; + } + } else { + psock = sk_psock_init(sk, map->numa_node); + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); + goto out_progs; + } + } + + if (msg_parser) + psock_set_prog(&psock->progs.msg_parser, msg_parser); + if (stream_parser) + psock_set_prog(&psock->progs.stream_parser, stream_parser); + if (stream_verdict) + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + if (skb_verdict) + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + + /* msg_* and stream_* programs references tracked in psock after this + * point. Reference dec and cleanup will occur through psock destructor + */ + ret = sock_map_init_proto(sk, psock); + if (ret < 0) { + sk_psock_put(sk, psock); + goto out; + } + + write_lock_bh(&sk->sk_callback_lock); + if (stream_parser && stream_verdict && !psock->saved_data_ready) { + ret = sk_psock_init_strp(sk, psock); + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_put(sk, psock); + goto out; + } + sk_psock_start_strp(sk, psock); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + sk_psock_start_verdict(sk, psock); + } + write_unlock_bh(&sk->sk_callback_lock); + return 0; +out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: + if (msg_parser) + bpf_prog_put(msg_parser); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); +out: + return ret; +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ + synchronize_rcu(); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock **psk = &stab->sks[i]; + struct sock *sk; + + sk = xchg(psk, NULL); + if (sk) { + sock_hold(sk); + lock_sock(sk); + rcu_read_lock(); + sock_map_unref(sk, psk); + rcu_read_unlock(); + release_sock(sk); + sock_put(sk); + } + } + + /* wait for psock readers accessing its map link */ + synchronize_rcu(); + + bpf_map_area_free(stab->sks); + bpf_map_area_free(stab); +} + +static void sock_map_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); +} + +static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(key >= map->max_entries)) + return NULL; + return READ_ONCE(stab->sks[key]); +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; +} + +static void *sock_map_lookup_sys(struct bpf_map *map, void *key) +{ + struct sock *sk; + + if (map->value_size != sizeof(u64)) + return ERR_PTR(-ENOSPC); + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk) + return ERR_PTR(-ENOENT); + + __sock_gen_cookie(sk); + return &sk->sk_cookie; +} + +static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, + struct sock **psk) +{ + struct sock *sk; + int err = 0; + + raw_spin_lock_bh(&stab->lock); + sk = *psk; + if (!sk_test || sk_test == sk) + sk = xchg(psk, NULL); + + if (likely(sk)) + sock_map_unref(sk, psk); + else + err = -EINVAL; + + raw_spin_unlock_bh(&stab->lock); + return err; +} + +static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + __sock_map_delete(stab, sk, link_raw); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = *(u32 *)key; + struct sock **psk; + + if (unlikely(i >= map->max_entries)) + return -EINVAL; + + psk = &stab->sks[i]; + return __sock_map_delete(stab, NULL, psk); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? *(u32 *)key : U32_MAX; + u32 *key_next = next; + + if (i == stab->map.max_entries - 1) + return -ENOENT; + if (i >= stab->map.max_entries) + *key_next = 0; + else + *key_next = i + 1; + return 0; +} + +static int sock_map_update_common(struct bpf_map *map, u32 idx, + struct sock *sk, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct sk_psock_link *link; + struct sk_psock *psock; + struct sock *osk; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(idx >= map->max_entries)) + return -E2BIG; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + raw_spin_lock_bh(&stab->lock); + osk = stab->sks[idx]; + if (osk && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!osk && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + sock_map_add_link(psock, link, map, &stab->sks[idx]); + stab->sks[idx] = sk; + if (osk) + sock_map_unref(osk, &stab->sks[idx]); + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&stab->lock); + if (psock) + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; +} + +static bool sock_map_redirect_allowed(const struct sock *sk) +{ + if (sk_is_tcp(sk)) + return sk->sk_state != TCP_LISTEN; + else + return sk->sk_state == TCP_ESTABLISHED; +} + +static bool sock_map_sk_is_suitable(const struct sock *sk) +{ + return !!sk->sk_prot->psock_update_sk_prot; +} + +static bool sock_map_sk_state_allowed(const struct sock *sk) +{ + if (sk_is_tcp(sk)) + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); + if (sk_is_stream_unix(sk)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; + return true; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags); + +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + struct socket *sock; + struct sock *sk; + int ret; + u64 ufd; + + if (map->value_size == sizeof(u64)) + ufd = *(u64 *)value; + else + ufd = *(u32 *)value; + if (ufd > S32_MAX) + return -EINVAL; + + sock = sockfd_lookup(ufd, &ret); + if (!sock) + return ret; + sk = sock->sk; + if (!sk) { + ret = -EINVAL; + goto out; + } + if (!sock_map_sk_is_suitable(sk)) { + ret = -EOPNOTSUPP; + goto out; + } + + sock_map_sk_acquire(sk); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); + else + ret = sock_hash_update_common(map, key, sk, flags); + sock_map_sk_release(sk); +out: + sockfd_put(sock); + return ret; +} + +static int sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct sock *sk = (struct sock *)value; + int ret; + + if (unlikely(!sk || !sk_fullsock(sk))) + return -EINVAL; + + if (!sock_map_sk_is_suitable(sk)) + return -EOPNOTSUPP; + + local_bh_disable(); + bh_lock_sock(sk); + if (!sock_map_sk_state_allowed(sk)) + ret = -EOPNOTSUPP; + else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) + ret = sock_map_update_common(map, *(u32 *)key, sk, flags); + else + ret = sock_hash_update_common(map, key, sk, flags); + bh_unlock_sock(sk); + local_bh_enable(); + return ret; +} + +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_map_update_common(map, *(u32 *)key, sops->sk, + flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_map_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_map_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = sk; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +struct sock_map_seq_info { + struct bpf_map *map; + struct sock *sk; + u32 index; +}; + +struct bpf_iter__sockmap { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(struct sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, + struct sock *sk) + +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) +{ + if (unlikely(info->index >= info->map->max_entries)) + return NULL; + + info->sk = __sock_map_lookup_elem(info->map, info->index); + + /* can't return sk directly, since that might be NULL */ + return info; +} + +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct sock_map_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_map_seq_stop */ + rcu_read_lock(); + return sock_map_seq_lookup_elem(info); +} + +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) + __must_hold(rcu) +{ + struct sock_map_seq_info *info = seq->private; + + ++*pos; + ++info->index; + + return sock_map_seq_lookup_elem(info); +} + +static int sock_map_seq_show(struct seq_file *seq, void *v) + __must_hold(rcu) +{ + struct sock_map_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !v); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + ctx.sk = info->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_map_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + if (!v) + (void)sock_map_seq_show(seq, NULL); + + /* pairs with sock_map_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_map_seq_ops = { + .start = sock_map_seq_start, + .next = sock_map_seq_next, + .stop = sock_map_seq_stop, + .show = sock_map_seq_show, +}; + +static int sock_map_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_map_seq_info *info = priv_data; + + bpf_map_inc_with_uref(aux->map); + info->map = aux->map; + return 0; +} + +static void sock_map_fini_seq_private(void *priv_data) +{ + struct sock_map_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { + .seq_ops = &sock_map_seq_ops, + .init_seq_private = sock_map_init_seq_private, + .fini_seq_private = sock_map_fini_seq_private, + .seq_priv_size = sizeof(struct sock_map_seq_info), +}; + +BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) +const struct bpf_map_ops sock_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_get_next_key = sock_map_get_next_key, + .map_lookup_elem_sys_only = sock_map_lookup_sys, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_lookup_elem = sock_map_lookup, + .map_release_uref = sock_map_release_progs, + .map_check_btf = map_check_no_btf, + .map_btf_id = &sock_map_btf_ids[0], + .iter_seq_info = &sock_map_iter_seq_info, +}; + +struct bpf_shtab_elem { + struct rcu_head rcu; + u32 hash; + struct sock *sk; + struct hlist_node node; + u8 key[]; +}; + +struct bpf_shtab_bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_shtab { + struct bpf_map map; + struct bpf_shtab_bucket *buckets; + u32 buckets_num; + u32 elem_size; + struct sk_psock_progs progs; + atomic_t count; +}; + +static inline u32 sock_hash_bucket_hash(const void *key, u32 len) +{ + return jhash(key, len, 0); +} + +static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, + u32 hash) +{ + return &htab->buckets[hash & (htab->buckets_num - 1)]; +} + +static struct bpf_shtab_elem * +sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, + u32 key_size) +{ + struct bpf_shtab_elem *elem; + + hlist_for_each_entry_rcu(elem, head, node) { + if (elem->hash == hash && + !memcmp(&elem->key, key, key_size)) + return elem; + } + + return NULL; +} + +static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u32 key_size = map->key_size, hash; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + + return elem ? elem->sk : NULL; +} + +static void sock_hash_free_elem(struct bpf_shtab *htab, + struct bpf_shtab_elem *elem) +{ + atomic_dec(&htab->count); + kfree_rcu(elem, rcu); +} + +static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, + void *link_raw) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem_probe, *elem = link_raw; + struct bpf_shtab_bucket *bucket; + + WARN_ON_ONCE(!rcu_read_lock_held()); + bucket = sock_hash_select_bucket(htab, elem->hash); + + /* elem may be deleted in parallel from the map, but access here + * is okay since it's going away only after RCU grace period. + * However, we need to check whether it's still present. + */ + raw_spin_lock_bh(&bucket->lock); + elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, + elem->key, map->key_size); + if (elem_probe && elem_probe == elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u32 hash, key_size = map->key_size; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + int ret = -ENOENT; + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + ret = 0; + } + raw_spin_unlock_bh(&bucket->lock); + return ret; +} + +static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, + void *key, u32 key_size, + u32 hash, struct sock *sk, + struct bpf_shtab_elem *old) +{ + struct bpf_shtab_elem *new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + + new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!new) { + atomic_dec(&htab->count); + return ERR_PTR(-ENOMEM); + } + memcpy(new->key, key, key_size); + new->sk = sk; + new->hash = hash; + return new; +} + +static int sock_hash_update_common(struct bpf_map *map, void *key, + struct sock *sk, u64 flags) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u32 key_size = map->key_size, hash; + struct bpf_shtab_elem *elem, *elem_new; + struct bpf_shtab_bucket *bucket; + struct sk_psock_link *link; + struct sk_psock *psock; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + link = sk_psock_init_link(); + if (!link) + return -ENOMEM; + + ret = sock_map_link(map, sk); + if (ret < 0) + goto out_free; + + psock = sk_psock(sk); + WARN_ON_ONCE(!psock); + + hash = sock_hash_bucket_hash(key, key_size); + bucket = sock_hash_select_bucket(htab, hash); + + raw_spin_lock_bh(&bucket->lock); + elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); + if (elem && flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out_unlock; + } else if (!elem && flags == BPF_EXIST) { + ret = -ENOENT; + goto out_unlock; + } + + elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); + if (IS_ERR(elem_new)) { + ret = PTR_ERR(elem_new); + goto out_unlock; + } + + sock_map_add_link(psock, link, map, elem_new); + /* Add new element to the head of the list, so that + * concurrent search will find it before old elem. + */ + hlist_add_head_rcu(&elem_new->node, &bucket->head); + if (elem) { + hlist_del_rcu(&elem->node); + sock_map_unref(elem->sk, elem); + sock_hash_free_elem(htab, elem); + } + raw_spin_unlock_bh(&bucket->lock); + return 0; +out_unlock: + raw_spin_unlock_bh(&bucket->lock); + sk_psock_put(sk, psock); +out_free: + sk_psock_free_link(link); + return ret; +} + +static int sock_hash_get_next_key(struct bpf_map *map, void *key, + void *key_next) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_elem *elem, *elem_next; + u32 hash, key_size = map->key_size; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first_elem; + hash = sock_hash_bucket_hash(key, key_size); + head = &sock_hash_select_bucket(htab, hash)->head; + elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); + if (!elem) + goto find_first_elem; + + elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), + struct bpf_shtab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + + i = hash & (htab->buckets_num - 1); + i++; +find_first_elem: + for (; i < htab->buckets_num; i++) { + head = &sock_hash_select_bucket(htab, i)->head; + elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), + struct bpf_shtab_elem, node); + if (elem_next) { + memcpy(key_next, elem_next->key, key_size); + return 0; + } + } + + return -ENOENT; +} + +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_shtab *htab; + int i, err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->max_entries == 0 || + attr->key_size == 0 || + (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + if (attr->key_size > MAX_BPF_STACK) + return ERR_PTR(-E2BIG); + + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct bpf_shtab_elem) + + round_up(htab->map.key_size, 8); + if (htab->buckets_num == 0 || + htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { + err = -EINVAL; + goto free_htab; + } + + htab->buckets = bpf_map_area_alloc(htab->buckets_num * + sizeof(struct bpf_shtab_bucket), + htab->map.numa_node); + if (!htab->buckets) { + err = -ENOMEM; + goto free_htab; + } + + for (i = 0; i < htab->buckets_num; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + bpf_map_area_free(htab); + return ERR_PTR(err); +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + struct bpf_shtab_bucket *bucket; + struct hlist_head unlink_list; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + int i; + + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ + synchronize_rcu(); + for (i = 0; i < htab->buckets_num; i++) { + bucket = sock_hash_select_bucket(htab, i); + + /* We are racing with sock_hash_delete_from_link to + * enter the spin-lock critical section. Every socket on + * the list is still linked to sockhash. Since link + * exists, psock exists and holds a ref to socket. That + * lets us to grab a socket ref too. + */ + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry(elem, &bucket->head, node) + sock_hold(elem->sk); + hlist_move_list(&bucket->head, &unlink_list); + raw_spin_unlock_bh(&bucket->lock); + + /* Process removed entries out of atomic context to + * block for socket lock before deleting the psock's + * link to sockhash. + */ + hlist_for_each_entry_safe(elem, node, &unlink_list, node) { + hlist_del(&elem->node); + lock_sock(elem->sk); + rcu_read_lock(); + sock_map_unref(elem->sk, elem); + rcu_read_unlock(); + release_sock(elem->sk); + sock_put(elem->sk); + sock_hash_free_elem(htab, elem); + } + } + + /* wait for psock readers accessing its map link */ + synchronize_rcu(); + + bpf_map_area_free(htab->buckets); + bpf_map_area_free(htab); +} + +static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) +{ + struct sock *sk; + + if (map->value_size != sizeof(u64)) + return ERR_PTR(-ENOSPC); + + sk = __sock_hash_lookup_elem(map, key); + if (!sk) + return ERR_PTR(-ENOENT); + + __sock_gen_cookie(sk); + return &sk->sk_cookie; +} + +static void *sock_hash_lookup(struct bpf_map *map, void *key) +{ + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; +} + +static void sock_hash_release_progs(struct bpf_map *map) +{ + psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); +} + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(sock_map_sk_is_suitable(sops->sk) && + sock_map_op_okay(sops))) + return sock_hash_update_common(map, key, sops->sk, flags); + return -EOPNOTSUPP; +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_hash_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); + return SK_PASS; +} + +const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct sock *sk; + + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + sk = __sock_hash_lookup_elem(map, key); + if (unlikely(!sk || !sock_map_redirect_allowed(sk))) + return SK_DROP; + if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = sk; + return SK_PASS; +} + +const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +struct sock_hash_seq_info { + struct bpf_map *map; + struct bpf_shtab *htab; + u32 bucket_id; +}; + +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, + struct bpf_shtab_elem *prev_elem) +{ + const struct bpf_shtab *htab = info->htab; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + + /* no more elements, continue in the next bucket */ + info->bucket_id++; + } + + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { + bucket = &htab->buckets[info->bucket_id]; + node = rcu_dereference(hlist_first_rcu(&bucket->head)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + } + + return NULL; +} + +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_hash_seq_stop */ + rcu_read_lock(); + return sock_hash_seq_find_next(info, NULL); +} + +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) + __must_hold(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + + ++*pos; + return sock_hash_seq_find_next(info, v); +} + +static int sock_hash_seq_show(struct seq_file *seq, void *v) + __must_hold(rcu) +{ + struct sock_hash_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_shtab_elem *elem = v; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !elem); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->key; + ctx.sk = elem->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_hash_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + if (!v) + (void)sock_hash_seq_show(seq, NULL); + + /* pairs with sock_hash_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_hash_seq_ops = { + .start = sock_hash_seq_start, + .next = sock_hash_seq_next, + .stop = sock_hash_seq_stop, + .show = sock_hash_seq_show, +}; + +static int sock_hash_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_hash_seq_info *info = priv_data; + + bpf_map_inc_with_uref(aux->map); + info->map = aux->map; + info->htab = container_of(aux->map, struct bpf_shtab, map); + return 0; +} + +static void sock_hash_fini_seq_private(void *priv_data) +{ + struct sock_hash_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { + .seq_ops = &sock_hash_seq_ops, + .init_seq_private = sock_hash_init_seq_private, + .fini_seq_private = sock_hash_fini_seq_private, + .seq_priv_size = sizeof(struct sock_hash_seq_info), +}; + +BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) +const struct bpf_map_ops sock_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_hash_delete_elem, + .map_lookup_elem = sock_hash_lookup, + .map_lookup_elem_sys_only = sock_hash_lookup_sys, + .map_release_uref = sock_hash_release_progs, + .map_check_btf = map_check_no_btf, + .map_btf_id = &sock_hash_map_btf_ids[0], + .iter_seq_info = &sock_hash_iter_seq_info, +}; + +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return &container_of(map, struct bpf_stab, map)->progs; + case BPF_MAP_TYPE_SOCKHASH: + return &container_of(map, struct bpf_shtab, map)->progs; + default: + break; + } + + return NULL; +} + +static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + u32 which) +{ + struct sk_psock_progs *progs = sock_map_progs(map); + + if (!progs) + return -EOPNOTSUPP; + + switch (which) { + case BPF_SK_MSG_VERDICT: + *pprog = &progs->msg_parser; + break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + case BPF_SK_SKB_STREAM_PARSER: + *pprog = &progs->stream_parser; + break; +#endif + case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; + *pprog = &progs->stream_verdict; + break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + *pprog = &progs->skb_verdict; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) +{ + struct bpf_prog **pprog; + int ret; + + ret = sock_map_prog_lookup(map, &pprog, which); + if (ret) + return ret; + + if (old) + return psock_replace_prog(pprog, prog, old); + + psock_set_prog(pprog, prog); + return 0; +} + +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + struct bpf_prog **pprog; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + u32 id = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + rcu_read_lock(); + + ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + if (ret) + goto end; + + prog = *pprog; + prog_cnt = !prog ? 0 : 1; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + goto end; + + /* we do not hold the refcnt, the bpf prog may be released + * asynchronously and the id would be set to 0. + */ + id = data_race(prog->aux->id); + if (id == 0) + prog_cnt = 0; + +end: + rcu_read_unlock(); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || + (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || + copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + ret = -EFAULT; + + fdput(f); + return ret; +} + +static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) +{ + switch (link->map->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + return sock_map_delete_from_link(link->map, sk, + link->link_raw); + case BPF_MAP_TYPE_SOCKHASH: + return sock_hash_delete_from_link(link->map, sk, + link->link_raw); + default: + break; + } +} + +static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + while ((link = sk_psock_link_pop(psock))) { + sock_map_unlink(sk, link); + sk_psock_free_link(link); + } +} + +void sock_map_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + } + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); +} +EXPORT_SYMBOL_GPL(sock_map_unhash); + +void sock_map_destroy(struct sock *sk) +{ + void (*saved_destroy)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock_get(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); + } + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); +} +EXPORT_SYMBOL_GPL(sock_map_destroy); + +void sock_map_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock_get(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + saved_close = READ_ONCE(sk->sk_prot)->close; + } else { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_delayed_work_sync(&psock->work); + sk_psock_put(sk, psock); + } + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; + saved_close(sk, timeout); +} +EXPORT_SYMBOL_GPL(sock_map_close); + +static int sock_map_iter_attach_target(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) + goto put_map; + + if (prog->aux->max_rdonly_access > map->key_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static struct bpf_iter_reg sock_map_iter_reg = { + .target = "sockmap", + .attach_target = sock_map_iter_attach_target, + .detach_target = sock_map_iter_detach_target, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_sockmap_iter_init(void) +{ + sock_map_iter_reg.ctx_arg_info[1].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&sock_map_iter_reg); +} +late_initcall(bpf_sockmap_iter_init); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c new file mode 100644 index 000000000..5a165286e --- /dev/null +++ b/net/core/sock_reuseport.c @@ -0,0 +1,749 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * To speed up listener socket lookup, create an array to store all sockets + * listening on the same port. This allows a decision to be made after finding + * the first socket. An optional BPF program can also be configured for + * selecting the socket index from the array of available sockets. + */ + +#include <net/ip.h> +#include <net/sock_reuseport.h> +#include <linux/bpf.h> +#include <linux/idr.h> +#include <linux/filter.h> +#include <linux/rcupdate.h> + +#define INIT_SOCKS 128 + +DEFINE_SPINLOCK(reuseport_lock); + +static DEFINE_IDA(reuseport_ida); +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany); + +void reuseport_has_conns_set(struct sock *sk) +{ + struct sock_reuseport *reuse; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (likely(reuse)) + reuse->has_conns = 1; + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_has_conns_set); + +static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); +} + +static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); +} + +static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_get_incoming_cpu(reuse); +} + +static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_put_incoming_cpu(reuse); +} + +void reuseport_update_incoming_cpu(struct sock *sk, int val) +{ + struct sock_reuseport *reuse; + int old_sk_incoming_cpu; + + if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { + /* Paired with REAE_ONCE() in sk_incoming_cpu_update() + * and compute_score(). + */ + WRITE_ONCE(sk->sk_incoming_cpu, val); + return; + } + + spin_lock_bh(&reuseport_lock); + + /* This must be done under reuseport_lock to avoid a race with + * reuseport_grow(), which accesses sk->sk_incoming_cpu without + * lock_sock() when detaching a shutdown()ed sk. + * + * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). + */ + old_sk_incoming_cpu = sk->sk_incoming_cpu; + WRITE_ONCE(sk->sk_incoming_cpu, val); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk. */ + if (!reuse) + goto out; + + if (old_sk_incoming_cpu < 0 && val >= 0) + __reuseport_get_incoming_cpu(reuse); + else if (old_sk_incoming_cpu >= 0 && val < 0) + __reuseport_put_incoming_cpu(reuse); + +out: + spin_unlock_bh(&reuseport_lock); +} + +static int reuseport_sock_index(struct sock *sk, + const struct sock_reuseport *reuse, + bool closed) +{ + int left, right; + + if (!closed) { + left = 0; + right = reuse->num_socks; + } else { + left = reuse->max_socks - reuse->num_closed_socks; + right = reuse->max_socks; + } + + for (; left < right; left++) + if (reuse->socks[left] == sk) + return left; + return -1; +} + +static void __reuseport_add_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ + smp_wmb(); + reuse->num_socks++; + reuseport_get_incoming_cpu(sk, reuse); +} + +static bool __reuseport_detach_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, false); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + reuseport_put_incoming_cpu(sk, reuse); + + return true; +} + +static void __reuseport_add_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); + reuseport_get_incoming_cpu(sk, reuse); +} + +static bool __reuseport_detach_closed_sock(struct sock *sk, + struct sock_reuseport *reuse) +{ + int i = reuseport_sock_index(sk, reuse, true); + + if (i == -1) + return false; + + reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + /* paired with READ_ONCE() in inet_csk_bind_conflict() */ + WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + reuseport_put_incoming_cpu(sk, reuse); + + return true; +} + +static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) +{ + unsigned int size = sizeof(struct sock_reuseport) + + sizeof(struct sock *) * max_socks; + struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + + if (!reuse) + return NULL; + + reuse->max_socks = max_socks; + + RCU_INIT_POINTER(reuse->prog, NULL); + return reuse; +} + +int reuseport_alloc(struct sock *sk, bool bind_inany) +{ + struct sock_reuseport *reuse; + int id, ret = 0; + + /* bh lock used since this function call may precede hlist lock in + * soft irq of receive path or setsockopt from process context + */ + spin_lock_bh(&reuseport_lock); + + /* Allocation attempts can occur concurrently via the setsockopt path + * and the bind/hash path. Nothing to do when we lose the race. + */ + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + if (reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + ret = reuseport_resurrect(sk, reuse, NULL, bind_inany); + goto out; + } + + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. + */ + if (bind_inany) + reuse->bind_inany = bind_inany; + goto out; + } + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) { + ret = -ENOMEM; + goto out; + } + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + ret = id; + goto out; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + reuse->socks[0] = sk; + reuse->num_socks = 1; + reuseport_get_incoming_cpu(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + +out: + spin_unlock_bh(&reuseport_lock); + + return ret; +} +EXPORT_SYMBOL(reuseport_alloc); + +static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) +{ + struct sock_reuseport *more_reuse; + u32 more_socks_size, i; + + more_socks_size = reuse->max_socks * 2U; + if (more_socks_size > U16_MAX) { + if (reuse->num_closed_socks) { + /* Make room by removing a closed sk. + * The child has already been migrated. + * Only reqsk left at this point. + */ + struct sock *sk; + + sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; + RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL); + __reuseport_detach_closed_sock(sk, reuse); + + return reuse; + } + + return NULL; + } + + more_reuse = __reuseport_alloc(more_socks_size); + if (!more_reuse) + return NULL; + + more_reuse->num_socks = reuse->num_socks; + more_reuse->num_closed_socks = reuse->num_closed_socks; + more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; + more_reuse->has_conns = reuse->has_conns; + more_reuse->incoming_cpu = reuse->incoming_cpu; + + memcpy(more_reuse->socks, reuse->socks, + reuse->num_socks * sizeof(struct sock *)); + memcpy(more_reuse->socks + + (more_reuse->max_socks - more_reuse->num_closed_socks), + reuse->socks + (reuse->max_socks - reuse->num_closed_socks), + reuse->num_closed_socks * sizeof(struct sock *)); + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); + + for (i = 0; i < reuse->max_socks; ++i) + rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, + more_reuse); + + /* Note: we use kfree_rcu here instead of reuseport_free_rcu so + * that reuse and more_reuse can temporarily share a reference + * to prog. + */ + kfree_rcu(reuse, rcu); + return more_reuse; +} + +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); + ida_free(&reuseport_ida, reuse->reuseport_id); + kfree(reuse); +} + +/** + * reuseport_add_sock - Add a socket to the reuseport group of another. + * @sk: New socket to add to the group. + * @sk2: Socket belonging to the existing reuseport group. + * @bind_inany: Whether or not the group is bound to a local INANY address. + * + * May return ENOMEM and not add socket to group under memory pressure. + */ +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) +{ + struct sock_reuseport *old_reuse, *reuse; + + if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { + int err = reuseport_alloc(sk2, bind_inany); + + if (err) + return err; + } + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_closed_socks) { + /* sk was shutdown()ed before */ + int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany); + + spin_unlock_bh(&reuseport_lock); + return err; + } + + if (old_reuse && old_reuse->num_socks != 1) { + spin_unlock_bh(&reuseport_lock); + return -EBUSY; + } + + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + } + + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + if (old_reuse) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + return 0; +} +EXPORT_SYMBOL(reuseport_add_sock); + +static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, + struct sock_reuseport *reuse, bool bind_inany) +{ + if (old_reuse == reuse) { + /* If sk was in the same reuseport group, just pop sk out of + * the closed section and push sk into the listening section. + */ + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, old_reuse); + return 0; + } + + if (!reuse) { + /* In bind()/listen() path, we cannot carry over the eBPF prog + * for the shutdown()ed socket. In setsockopt() path, we should + * not change the eBPF prog of listening sockets by attaching a + * prog to the shutdown()ed socket. Thus, we will allocate a new + * reuseport group and detach sk from the old group. + */ + int id; + + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) + return -ENOMEM; + + id = ida_alloc(&reuseport_ida, GFP_ATOMIC); + if (id < 0) { + kfree(reuse); + return id; + } + + reuse->reuseport_id = id; + reuse->bind_inany = bind_inany; + } else { + /* Move sk from the old group to the new one if + * - all the other listeners in the old group were close()d or + * shutdown()ed, and then sk2 has listen()ed on the same port + * OR + * - sk listen()ed without bind() (or with autobind), was + * shutdown()ed, and then listen()s on another port which + * sk2 listen()s on. + */ + if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) + return -ENOMEM; + } + } + + __reuseport_detach_closed_sock(sk, old_reuse); + __reuseport_add_sock(sk, reuse); + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + if (old_reuse->num_socks + old_reuse->num_closed_socks == 0) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); + + return 0; +} + +void reuseport_detach_sock(struct sock *sk) +{ + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk */ + if (!reuse) + goto out; + + /* Notify the bpf side. The sk may be added to a sockarray + * map. If so, sockarray logic will remove it from the map. + * + * Other bpf map types that work with reuseport, like sockmap, + * don't need an explicit callback from here. They override sk + * unhash/close ops to remove the sk from the map before we + * get to this point. + */ + bpf_sk_reuseport_detach(sk); + + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + + if (!__reuseport_detach_closed_sock(sk, reuse)) + __reuseport_detach_sock(sk, reuse); + + if (reuse->num_socks + reuse->num_closed_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + +out: + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_detach_sock); + +void reuseport_stop_listen_sock(struct sock *sk) +{ + if (sk->sk_protocol == IPPROTO_TCP) { + struct sock_reuseport *reuse; + struct bpf_prog *prog; + + spin_lock_bh(&reuseport_lock); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || + (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { + /* Migration capable, move sk from the listening section + * to the closed section. + */ + bpf_sk_reuseport_detach(sk); + + __reuseport_detach_sock(sk, reuse); + __reuseport_add_closed_sock(sk, reuse); + + spin_unlock_bh(&reuseport_lock); + return; + } + + spin_unlock_bh(&reuseport_lock); + } + + /* Not capable to do migration, detach immediately */ + reuseport_detach_sock(sk); +} +EXPORT_SYMBOL(reuseport_stop_listen_sock); + +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) +{ + struct sk_buff *nskb = NULL; + u32 index; + + if (skb_shared(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + skb = nskb; + } + + /* temporarily advance data past protocol header */ + if (!pskb_pull(skb, hdr_len)) { + kfree_skb(nskb); + return NULL; + } + index = bpf_prog_run_save_cb(prog, skb); + __skb_push(skb, hdr_len); + + consume_skb(nskb); + + if (index >= socks) + return NULL; + + return reuse->socks[index]; +} + +static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, + u32 hash, u16 num_socks) +{ + struct sock *first_valid_sk = NULL; + int i, j; + + i = j = reciprocal_scale(hash, num_socks); + do { + struct sock *sk = reuse->socks[i]; + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ + if (!READ_ONCE(reuse->incoming_cpu)) + return sk; + + /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) + return sk; + + if (!first_valid_sk) + first_valid_sk = sk; + } + + i++; + if (i >= num_socks) + i = 0; + } while (i != j); + + return first_valid_sk; +} + +/** + * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. + * @sk: First socket in the group. + * @hash: When no BPF filter is available, use this hash to select. + * @skb: skb to run through BPF filter. + * @hdr_len: BPF filter expects skb data pointer at payload data. If + * the skb does not yet point at the payload, this parameter represents + * how far the pointer needs to advance to reach the payload. + * Returns a socket that should receive the packet (or NULL on error). + */ +struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len) +{ + struct sock_reuseport *reuse; + struct bpf_prog *prog; + struct sock *sk2 = NULL; + u16 socks; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + + /* if memory allocation failed or add call is not yet complete */ + if (!reuse) + goto out; + + prog = rcu_dereference(reuse->prog); + socks = READ_ONCE(reuse->num_socks); + if (likely(socks)) { + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); + +select_by_hash: + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) + sk2 = reuseport_select_sock_by_hash(reuse, hash, socks); + } + +out: + rcu_read_unlock(); + return sk2; +} +EXPORT_SYMBOL(reuseport_select_sock); + +/** + * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group. + * @sk: close()ed or shutdown()ed socket in the group. + * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or + * NEW_SYN_RECV request socket during 3WHS. + * @skb: skb to run through BPF filter. + * Returns a socket (with sk_refcnt +1) that should accept the child socket + * (or NULL on error). + */ +struct sock *reuseport_migrate_sock(struct sock *sk, + struct sock *migrating_sk, + struct sk_buff *skb) +{ + struct sock_reuseport *reuse; + struct sock *nsk = NULL; + bool allocated = false; + struct bpf_prog *prog; + u16 socks; + u32 hash; + + rcu_read_lock(); + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (!reuse) + goto out; + + socks = READ_ONCE(reuse->num_socks); + if (unlikely(!socks)) + goto failure; + + /* paired with smp_wmb() in __reuseport_add_sock() */ + smp_rmb(); + + hash = migrating_sk->sk_hash; + prog = rcu_dereference(reuse->prog); + if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) + goto select_by_hash; + goto failure; + } + + if (!skb) { + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto failure; + allocated = true; + } + + nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash); + + if (allocated) + kfree_skb(skb); + +select_by_hash: + if (!nsk) + nsk = reuseport_select_sock_by_hash(reuse, hash, socks); + + if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) { + nsk = NULL; + goto failure; + } + +out: + rcu_read_unlock(); + return nsk; + +failure: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); + goto out; +} +EXPORT_SYMBOL(reuseport_migrate_sock); + +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + if (sk_unhashed(sk)) { + int err; + + if (!sk->sk_reuseport) + return -EINVAL; + + err = reuseport_alloc(sk, false); + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(reuse->prog, prog); + spin_unlock_bh(&reuseport_lock); + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_attach_prog); + +int reuseport_detach_prog(struct sock *sk) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + old_prog = NULL; + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuse must be checked after acquiring the reuseport_lock + * because reuseport_grow() can detach a closed sk. + */ + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return sk->sk_reuseport ? -ENOENT : -EINVAL; + } + + if (sk_unhashed(sk) && reuse->num_closed_socks) { + spin_unlock_bh(&reuseport_lock); + return -ENOENT; + } + + old_prog = rcu_replace_pointer(reuse->prog, old_prog, + lockdep_is_held(&reuseport_lock)); + spin_unlock_bh(&reuseport_lock); + + if (!old_prog) + return -ENOENT; + + sk_reuseport_prog_free(old_prog); + return 0; +} +EXPORT_SYMBOL(reuseport_detach_prog); diff --git a/net/core/stream.c b/net/core/stream.c new file mode 100644 index 000000000..30e7deff4 --- /dev/null +++ b/net/core/stream.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SUCS NET3: + * + * Generic stream handling routines. These are generic for most + * protocols. Even IP. Tonight 8-). + * This is used because TCP, LLC (others too) layer all have mostly + * identical sendmsg() and recvmsg() code. + * So we (will) share it here. + * + * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * (from old tcp.c code) + * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-)) + */ + +#include <linux/module.h> +#include <linux/sched/signal.h> +#include <linux/net.h> +#include <linux/signal.h> +#include <linux/tcp.h> +#include <linux/wait.h> +#include <net/sock.h> + +/** + * sk_stream_write_space - stream socket write_space callback. + * @sk: socket + * + * FIXME: write proper description + */ +void sk_stream_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + struct socket_wq *wq; + + if (__sk_stream_is_writeable(sk, 1) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); + } +} + +/** + * sk_stream_wait_connect - Wait for a socket to get into the connected state + * @sk: sock to wait on + * @timeo_p: for how long to wait + * + * Must be called with the socket locked. + */ +int sk_stream_wait_connect(struct sock *sk, long *timeo_p) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct task_struct *tsk = current; + int done; + + do { + int err = sock_error(sk); + if (err) + return err; + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) + return -EPIPE; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(tsk)) + return sock_intr_errno(*timeo_p); + + add_wait_queue(sk_sleep(sk), &wait); + sk->sk_write_pending++; + done = sk_wait_event(sk, timeo_p, + !READ_ONCE(sk->sk_err) && + !((1 << READ_ONCE(sk->sk_state)) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); + remove_wait_queue(sk_sleep(sk), &wait); + sk->sk_write_pending--; + } while (!done); + return done < 0 ? done : 0; +} +EXPORT_SYMBOL(sk_stream_wait_connect); + +/** + * sk_stream_closing - Return 1 if we still have things to send in our buffers. + * @sk: socket to verify + */ +static int sk_stream_closing(const struct sock *sk) +{ + return (1 << READ_ONCE(sk->sk_state)) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +} + +void sk_stream_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + + do { + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait)) + break; + } while (!signal_pending(current) && timeout); + + remove_wait_queue(sk_sleep(sk), &wait); + } +} +EXPORT_SYMBOL(sk_stream_wait_close); + +/** + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk: socket to wait for memory + * @timeo_p: for how long + */ +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + int ret, err = 0; + long vm_wait = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + if (sk_stream_memory_free(sk)) + current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2; + + add_wait_queue(sk_sleep(sk), &wait); + + while (1) { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo_p) + goto do_eagain; + if (signal_pending(current)) + goto do_interrupted; + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (sk_stream_memory_free(sk) && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + ret = sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && !vm_wait), + &wait); + sk->sk_write_pending--; + if (ret < 0) + goto do_error; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo_p = current_timeo; + } +out: + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_eagain: + /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can + * be generated later. + * When TCP receives ACK packets that make room, tcp_check_space() + * only calls tcp_new_space() if SOCK_NOSPACE is set. + */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; +} +EXPORT_SYMBOL(sk_stream_wait_memory); + +int sk_stream_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} +EXPORT_SYMBOL(sk_stream_error); + +void sk_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. + * We need to use queue lock, because other threads might + * add packets to the queue without socket lock being held. + */ + skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); + + /* Account for returned memory. */ + sk_mem_reclaim_final(sk); + + WARN_ON_ONCE(sk->sk_wmem_queued); + + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ +} +EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 000000000..5b1ce656b --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,687 @@ +// SPDX-License-Identifier: GPL-2.0 +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). [MS] + */ + +#include <linux/filter.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/netdevice.h> +#include <linux/ratelimit.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/slab.h> + +#include <net/ip.h> +#include <net/sock.h> +#include <net/net_ratelimit.h> +#include <net/busy_poll.h> +#include <net/pkt_sched.h> + +#include "dev.h" + +static int int_3600 = 3600; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; + +static int net_msg_warn; /* Unused, but still a sysctl */ + +int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; +EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); + +/* 0 - Keep current behavior: + * IPv4: inherit all current settings from init_net + * IPv6: reset all settings to default + * 1 - Both inherit all current settings from init_net + * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns + */ +int sysctl_devconf_inherit_init_net __read_mostly; +EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); + +#ifdef CONFIG_RPS +static int rps_sock_flow_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int orig_size, size; + int ret, i; + struct ctl_table tmp = { + .data = &size, + .maxlen = sizeof(size), + .mode = table->mode + }; + struct rps_sock_flow_table *orig_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + + mutex_lock(&sock_flow_mutex); + + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); + size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + if (write) { + if (size) { + if (size > 1<<29) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = + vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); + if (!sock_table) { + mutex_unlock(&sock_flow_mutex); + return -ENOMEM; + } + rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + sock_table->mask = size - 1; + } else + sock_table = orig_sock_table; + + for (i = 0; i < size; i++) + sock_table->ents[i] = RPS_NO_CPU; + } else + sock_table = NULL; + + if (sock_table != orig_sock_table) { + rcu_assign_pointer(rps_sock_flow_table, sock_table); + if (sock_table) { + static_branch_inc(&rps_needed); + static_branch_inc(&rfs_needed); + } + if (orig_sock_table) { + static_branch_dec(&rps_needed); + static_branch_dec(&rfs_needed); + kvfree_rcu(orig_sock_table); + } + } + } + + mutex_unlock(&sock_flow_mutex); + + return ret; +} +#endif /* CONFIG_RPS */ + +#ifdef CONFIG_NET_FLOW_LIMIT +static DEFINE_MUTEX(flow_limit_update_mutex); + +static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct sd_flow_limit *cur; + struct softnet_data *sd; + cpumask_var_t mask; + int i, len, ret = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + if (write) { + ret = cpumask_parse(buffer, mask); + if (ret) + goto done; + + mutex_lock(&flow_limit_update_mutex); + len = sizeof(*cur) + netdev_flow_limit_table_len; + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + cur = rcu_dereference_protected(sd->flow_limit, + lockdep_is_held(&flow_limit_update_mutex)); + if (cur && !cpumask_test_cpu(i, mask)) { + RCU_INIT_POINTER(sd->flow_limit, NULL); + kfree_rcu(cur); + } else if (!cur && cpumask_test_cpu(i, mask)) { + cur = kzalloc_node(len, GFP_KERNEL, + cpu_to_node(i)); + if (!cur) { + /* not unwinding previous changes */ + ret = -ENOMEM; + goto write_unlock; + } + cur->num_buckets = netdev_flow_limit_table_len; + rcu_assign_pointer(sd->flow_limit, cur); + } + } +write_unlock: + mutex_unlock(&flow_limit_update_mutex); + } else { + char kbuf[128]; + + if (*ppos || !*lenp) { + *lenp = 0; + goto done; + } + + cpumask_clear(mask); + rcu_read_lock(); + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + if (rcu_dereference(sd->flow_limit)) + cpumask_set_cpu(i, mask); + } + rcu_read_unlock(); + + len = min(sizeof(kbuf) - 1, *lenp); + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + goto done; + } + if (len < *lenp) + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; + } + +done: + free_cpumask_var(mask); + return ret; +} + +static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int old, *ptr; + int ret; + + mutex_lock(&flow_limit_update_mutex); + + ptr = table->data; + old = *ptr; + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write && !is_power_of_2(*ptr)) { + *ptr = old; + ret = -EINVAL; + } + + mutex_unlock(&flow_limit_update_mutex); + return ret; +} +#endif /* CONFIG_NET_FLOW_LIMIT */ + +#ifdef CONFIG_NET_SCHED +static int set_default_qdisc(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + char id[IFNAMSIZ]; + struct ctl_table tbl = { + .data = id, + .maxlen = IFNAMSIZ, + }; + int ret; + + qdisc_get_default(id, IFNAMSIZ); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = qdisc_set_default(id); + return ret; +} +#endif + +static int proc_do_dev_weight(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + + mutex_lock(&dev_weight_mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); + + return ret; +} + +static int proc_do_rss_key(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table fake_table; + char buf[NETDEV_RSS_KEY_LEN * 3]; + + snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + return proc_dostring(&fake_table, write, buffer, lenp, ppos); +} + +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + int min = *(int *)table->extra1; + int max = *(int *)table->extra2; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + + if (write && ret && min == max) + pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); + + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif /* CONFIG_HAVE_EBPF_JIT */ + +static int +proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +static struct ctl_table net_core_table[] = { + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "dev_weight", + .data = &weight_p, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_rx_bias", + .data = &dev_weight_rx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_tx_bias", + .data = &dev_weight_tx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "netdev_max_backlog", + .data = &netdev_max_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "netdev_rss_key", + .data = &netdev_rss_key, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_do_rss_key, + }, +#ifdef CONFIG_BPF_JIT + { + .procname = "bpf_jit_enable", + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, +# else + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, +# endif + }, +# ifdef CONFIG_HAVE_EBPF_JIT + { + .procname = "bpf_jit_harden", + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_jit_kallsyms", + .data = &bpf_jit_kallsyms, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +# endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(long), + .mode = 0600, + .proc_handler = proc_dolongvec_minmax_bpf_restricted, + .extra1 = SYSCTL_LONG_ONE, + .extra2 = &bpf_jit_limit_max, + }, +#endif + { + .procname = "netdev_tstamp_prequeue", + .data = &netdev_tstamp_prequeue, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "message_cost", + .data = &net_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "message_burst", + .data = &net_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "optmem_max", + .data = &sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tstamp_allow_data", + .data = &sysctl_tstamp_allow_data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, +#ifdef CONFIG_RPS + { + .procname = "rps_sock_flow_entries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rps_sock_flow_sysctl + }, +#endif +#ifdef CONFIG_NET_FLOW_LIMIT + { + .procname = "flow_limit_cpu_bitmap", + .mode = 0644, + .proc_handler = flow_limit_cpu_sysctl + }, + { + .procname = "flow_limit_table_len", + .data = &netdev_flow_limit_table_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = flow_limit_table_len_sysctl + }, +#endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_RX_BUSY_POLL + { + .procname = "busy_poll", + .data = &sysctl_net_busy_poll, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "busy_read", + .data = &sysctl_net_busy_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_NET_SCHED + { + .procname = "default_qdisc", + .mode = 0644, + .maxlen = IFNAMSIZ, + .proc_handler = set_default_qdisc + }, +#endif + { + .procname = "netdev_budget", + .data = &netdev_budget, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "warnings", + .data = &net_msg_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "max_skb_frags", + .data = &sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &max_skb_frags, + }, + { + .procname = "netdev_budget_usecs", + .data = &netdev_budget_usecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "fb_tunnels_only_for_init_net", + .data = &sysctl_fb_tunnels_only_for_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "devconf_inherit_init_net", + .data = &sysctl_devconf_inherit_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "high_order_alloc_disable", + .data = &net_high_order_alloc_disable_key.key, + .maxlen = sizeof(net_high_order_alloc_disable_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, + { + .procname = "gro_normal_batch", + .data = &gro_normal_batch, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "netdev_unregister_timeout_secs", + .data = &netdev_unregister_timeout_secs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &int_3600, + }, + { + .procname = "skb_defer_max", + .data = &sysctl_skb_defer_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { } +}; + +static struct ctl_table netns_core_table[] = { + { + .procname = "somaxconn", + .data = &init_net.core.sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, + { } +}; + +static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) +{ + /* fallback tunnels for initns only */ + if (!strncmp(str, "initns", 6)) + sysctl_fb_tunnels_only_for_init_net = 1; + /* no fallback tunnels anywhere */ + else if (!strncmp(str, "none", 4)) + sysctl_fb_tunnels_only_for_init_net = 2; + + return 1; +} +__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); + +static __net_init int sysctl_core_net_init(struct net *net) +{ + struct ctl_table *tbl, *tmp; + + tbl = netns_core_table; + if (!net_eq(net, &init_net)) { + tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + + for (tmp = tbl; tmp->procname; tmp++) + tmp->data += (char *)net - (char *)&init_net; + + /* Don't export any sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) { + tbl[0].procname = NULL; + } + } + + net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); + if (net->core.sysctl_hdr == NULL) + goto err_reg; + + return 0; + +err_reg: + if (tbl != netns_core_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_core_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->core.sysctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->core.sysctl_hdr); + BUG_ON(tbl == netns_core_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_core_ops = { + .init = sysctl_core_net_init, + .exit = sysctl_core_net_exit, +}; + +static __init int sysctl_core_init(void) +{ + register_net_sysctl(&init_net, "net/core", net_core_table); + return register_pernet_subsys(&sysctl_core_ops); +} + +fs_initcall(sysctl_core_init); diff --git a/net/core/timestamping.c b/net/core/timestamping.c new file mode 100644 index 000000000..04840697f --- /dev/null +++ b/net/core/timestamping.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PTP 1588 clock support - support for timestamping in PHY devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + */ +#include <linux/errqueue.h> +#include <linux/phy.h> +#include <linux/ptp_classify.h> +#include <linux/skbuff.h> +#include <linux/export.h> + +static unsigned int classify(const struct sk_buff *skb) +{ + if (likely(skb->dev && skb->dev->phydev && + skb->dev->phydev->mii_ts)) + return ptp_classify_raw(skb); + else + return PTP_CLASS_NONE; +} + +void skb_clone_tx_timestamp(struct sk_buff *skb) +{ + struct mii_timestamper *mii_ts; + struct sk_buff *clone; + unsigned int type; + + if (!skb->sk) + return; + + type = classify(skb); + if (type == PTP_CLASS_NONE) + return; + + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->txtstamp)) { + clone = skb_clone_sk(skb); + if (!clone) + return; + mii_ts->txtstamp(mii_ts, clone, type); + } +} +EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); + +bool skb_defer_rx_timestamp(struct sk_buff *skb) +{ + struct mii_timestamper *mii_ts; + unsigned int type; + + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) + return false; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + + __skb_push(skb, ETH_HLEN); + + type = ptp_classify_raw(skb); + + __skb_pull(skb, ETH_HLEN); + + if (type == PTP_CLASS_NONE) + return false; + + mii_ts = skb->dev->phydev->mii_ts; + if (likely(mii_ts->rxtstamp)) + return mii_ts->rxtstamp(mii_ts, skb, type); + + return false; +} +EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); diff --git a/net/core/tso.c b/net/core/tso.c new file mode 100644 index 000000000..4148f6d48 --- /dev/null +++ b/net/core/tso.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/export.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <net/tso.h> +#include <asm/unaligned.h> + +/* Calculate expected number of TX descriptors */ +int tso_count_descs(const struct sk_buff *skb) +{ + /* The Marvell Way */ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} +EXPORT_SYMBOL(tso_count_descs); + +void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, + int size, bool is_last) +{ + int hdr_len = skb_transport_offset(skb) + tso->tlen; + int mac_hdr_len = skb_network_offset(skb); + + memcpy(hdr, skb->data, hdr_len); + if (!tso->ipv6) { + struct iphdr *iph = (void *)(hdr + mac_hdr_len); + + iph->id = htons(tso->ip_id); + iph->tot_len = htons(size + hdr_len - mac_hdr_len); + tso->ip_id++; + } else { + struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len); + + iph->payload_len = htons(size + tso->tlen); + } + hdr += skb_transport_offset(skb); + if (tso->tlen != sizeof(struct udphdr)) { + struct tcphdr *tcph = (struct tcphdr *)hdr; + + put_unaligned_be32(tso->tcp_seq, &tcph->seq); + + if (!is_last) { + /* Clear all special flags for not last packet */ + tcph->psh = 0; + tcph->fin = 0; + tcph->rst = 0; + } + } else { + struct udphdr *uh = (struct udphdr *)hdr; + + uh->len = htons(sizeof(*uh) + size); + } +} +EXPORT_SYMBOL(tso_build_hdr); + +void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size) +{ + tso->tcp_seq += size; /* not worth avoiding this operation for UDP */ + tso->size -= size; + tso->data += size; + + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); + tso->next_frag_idx++; + } +} +EXPORT_SYMBOL(tso_build_data); + +int tso_start(struct sk_buff *skb, struct tso_t *tso) +{ + int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr); + int hdr_len = skb_transport_offset(skb) + tlen; + + tso->tlen = tlen; + tso->ip_id = ntohs(ip_hdr(skb)->id); + tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0; + tso->next_frag_idx = 0; + tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6); + + /* Build first data */ + tso->size = skb_headlen(skb) - hdr_len; + tso->data = skb->data + hdr_len; + if ((tso->size == 0) && + (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx]; + + /* Move to next segment */ + tso->size = skb_frag_size(frag); + tso->data = skb_frag_address(frag); + tso->next_frag_idx++; + } + return hdr_len; +} +EXPORT_SYMBOL(tso_start); diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 000000000..938495bc1 --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andi Kleen + * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project + * + * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/inet.h> +#include <linux/mm.h> +#include <linux/net.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/ratelimit.h> +#include <linux/socket.h> + +#include <net/sock.h> +#include <net/net_ratelimit.h> +#include <net/ipv6.h> + +#include <asm/byteorder.h> +#include <linux/uaccess.h> + +DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); +/* + * All net warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + return __ratelimit(&net_ratelimit_state); +} +EXPORT_SYMBOL(net_ratelimit); + +/* + * Convert an ASCII string to binary IP. + * This is outside of net/ipv4/ because various code that uses IP addresses + * is otherwise not dependent on the TCP/IP stack. + */ + +__be32 in_aton(const char *str) +{ + unsigned int l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) { + l <<= 8; + if (*str != '\0') { + val = 0; + while (*str != '\0' && *str != '.' && *str != '\n') { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return htonl(l); +} +EXPORT_SYMBOL(in_aton); + +#define IN6PTON_XDIGIT 0x00010000 +#define IN6PTON_DIGIT 0x00020000 +#define IN6PTON_COLON_MASK 0x00700000 +#define IN6PTON_COLON_1 0x00100000 /* single : requested */ +#define IN6PTON_COLON_2 0x00200000 /* second : requested */ +#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */ +#define IN6PTON_DOT 0x00800000 /* . */ +#define IN6PTON_DELIM 0x10000000 +#define IN6PTON_NULL 0x20000000 /* first/tail */ +#define IN6PTON_UNKNOWN 0x40000000 + +static inline int xdigit2bin(char c, int delim) +{ + int val; + + if (c == delim || c == '\0') + return IN6PTON_DELIM; + if (c == ':') + return IN6PTON_COLON_MASK; + if (c == '.') + return IN6PTON_DOT; + + val = hex_to_bin(c); + if (val >= 0) + return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); + + if (delim == -1) + return IN6PTON_DELIM; + return IN6PTON_UNKNOWN; +} + +/** + * in4_pton - convert an IPv4 address from literal to binary representation + * @src: the start of the IPv4 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[4] array) representation of the IPv4 address + * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */ +int in4_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s; + u8 *d; + u8 dbuf[4]; + int ret = 0; + int i; + int w = 0; + + if (srclen < 0) + srclen = strlen(src); + s = src; + d = dbuf; + i = 0; + while (1) { + int c; + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { + goto out; + } + if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (w == 0) + goto out; + *d++ = w & 0xff; + w = 0; + i++; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (i != 4) + goto out; + break; + } + goto cont; + } + w = (w * 10) + c; + if ((w & 0xffff) > 255) { + goto out; + } +cont: + if (i >= 4) + goto out; + s++; + srclen--; + } + ret = 1; + memcpy(dst, dbuf, sizeof(dbuf)); +out: + if (end) + *end = s; + return ret; +} +EXPORT_SYMBOL(in4_pton); + +/** + * in6_pton - convert an IPv6 address from literal to binary representation + * @src: the start of the IPv6 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[16] array) representation of the IPv6 address + * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */ +int in6_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s, *tok = NULL; + u8 *d, *dc = NULL; + u8 dbuf[16]; + int ret = 0; + int i; + int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; + int w = 0; + + memset(dbuf, 0, sizeof(dbuf)); + + s = src; + d = dbuf; + if (srclen < 0) + srclen = strlen(src); + + while (1) { + int c; + + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & state)) + goto out; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + /* process one 16-bit word */ + if (!(state & IN6PTON_NULL)) { + *d++ = (w >> 8) & 0xff; + *d++ = w & 0xff; + } + w = 0; + if (c & IN6PTON_DELIM) { + /* We've processed last word */ + break; + } + /* + * COLON_1 => XDIGIT + * COLON_2 => XDIGIT|DELIM + * COLON_1_2 => COLON_2 + */ + switch (state & IN6PTON_COLON_MASK) { + case IN6PTON_COLON_2: + dc = d; + state = IN6PTON_XDIGIT | IN6PTON_DELIM; + if (dc - dbuf >= sizeof(dbuf)) + state |= IN6PTON_NULL; + break; + case IN6PTON_COLON_1|IN6PTON_COLON_1_2: + state = IN6PTON_XDIGIT | IN6PTON_COLON_2; + break; + case IN6PTON_COLON_1: + state = IN6PTON_XDIGIT; + break; + case IN6PTON_COLON_1_2: + state = IN6PTON_COLON_2; + break; + default: + state = 0; + } + tok = s + 1; + goto cont; + } + + if (c & IN6PTON_DOT) { + ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s); + if (ret > 0) { + d += 4; + break; + } + goto out; + } + + w = (w << 4) | (0xff & c); + state = IN6PTON_COLON_1 | IN6PTON_DELIM; + if (!(w & 0xf000)) { + state |= IN6PTON_XDIGIT; + } + if (!dc && d + 2 < dbuf + sizeof(dbuf)) { + state |= IN6PTON_COLON_1_2; + state &= ~IN6PTON_DELIM; + } + if (d + 2 >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2); + } +cont: + if ((dc && d + 4 < dbuf + sizeof(dbuf)) || + d + 4 == dbuf + sizeof(dbuf)) { + state |= IN6PTON_DOT; + } + if (d >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK); + } + s++; + srclen--; + } + + i = 15; d--; + + if (dc) { + while (d >= dc) + dst[i--] = *d--; + while (i >= dc - dbuf) + dst[i--] = 0; + while (i >= 0) + dst[i--] = *d--; + } else + memcpy(dst, dbuf, sizeof(dbuf)); + + ret = 1; +out: + if (end) + *end = s; + return ret; +} +EXPORT_SYMBOL(in6_pton); + +static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + int srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + int srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id) - 1, + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any error occurs. + */ +int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, + const char *src, const char *port, struct sockaddr_storage *addr) +{ + u16 port_num; + int ret = -EINVAL; + + if (port) { + if (kstrtou16(port, 0, &port_num)) + return -EINVAL; + } else { + port_num = 0; + } + + switch (af) { + case AF_INET: + ret = inet4_pton(src, port_num, addr); + break; + case AF_INET6: + ret = inet6_pton(net, src, port_num, addr); + break; + case AF_UNSPEC: + ret = inet4_pton(src, port_num, addr); + if (ret) + ret = inet6_pton(net, src, port_num, addr); + break; + default: + pr_err("unexpected address family %d\n", af); + } + + return ret; +} +EXPORT_SYMBOL(inet_pton_with_scope); + +bool inet_addr_is_any(struct sockaddr *addr) +{ + if (addr->sa_family == AF_INET6) { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; + const struct sockaddr_in6 in6_any = + { .sin6_addr = IN6ADDR_ANY_INIT }; + + if (!memcmp(in6->sin6_addr.s6_addr, + in6_any.sin6_addr.s6_addr, 16)) + return true; + } else if (addr->sa_family == AF_INET) { + struct sockaddr_in *in = (struct sockaddr_in *)addr; + + if (in->sin_addr.s_addr == htonl(INADDR_ANY)) + return true; + } else { + pr_warn("unexpected address family %u\n", addr->sa_family); + } + + return false; +} +EXPORT_SYMBOL(inet_addr_is_any); + +void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, bool pseudohdr) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) { + csum_replace4(sum, from, to); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_add(csum_sub(~(skb->csum), + (__force __wsum)from), + (__force __wsum)to); + } else if (pseudohdr) + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), + (__force __wsum)from), + (__force __wsum)to)); +} +EXPORT_SYMBOL(inet_proto_csum_replace4); + +/** + * inet_proto_csum_replace16 - update layer 4 header checksum field + * @sum: Layer 4 header checksum field + * @skb: sk_buff for the packet + * @from: old IPv6 address + * @to: new IPv6 address + * @pseudohdr: True if layer 4 header checksum includes pseudoheader + * + * Update layer 4 header as per the update in IPv6 src/dst address. + * + * There is no need to update skb->csum in this function, because update in two + * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other + * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to + * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, + * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as + * L4 Header checksum for skb->csum calculation. + */ +void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, + const __be32 *from, const __be32 *to, + bool pseudohdr) +{ + __be32 diff[] = { + ~from[0], ~from[1], ~from[2], ~from[3], + to[0], to[1], to[2], to[3], + }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial(diff, sizeof(diff), + ~csum_unfold(*sum))); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial(diff, sizeof(diff), + csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace16); + +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) { + csum_replace_by_diff(sum, diff); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_sub(diff, skb->csum); + } else if (pseudohdr) { + *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); + } +} +EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 000000000..844c9d99d --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,711 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + */ +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> +#include <linux/bug.h> +#include <net/page_pool.h> + +#include <net/xdp.h> +#include <net/xdp_priv.h> /* struct xdp_mem_allocator */ +#include <trace/events/xdp.h> +#include <net/xdp_sock_drv.h> + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key */ + return key; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = sizeof_field(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + kfree(xa); +} + +static void mem_xa_remove(struct xdp_mem_allocator *xa) +{ + trace_mem_disconnect(xa); + + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); +} + +static void mem_allocator_disconnect(void *allocator) +{ + struct xdp_mem_allocator *xa; + struct rhashtable_iter iter; + + mutex_lock(&mem_id_lock); + + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); + + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { + if (xa->allocator == allocator) + mem_xa_remove(xa); + } + + rhashtable_walk_stop(&iter); + + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + + mutex_unlock(&mem_id_lock); +} + +void xdp_unreg_mem_model(struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + int type = mem->type; + int id = mem->id; + + /* Reset mem info to defaults */ + mem->id = 0; + mem->type = 0; + + if (id == 0) + return; + + if (type == MEM_TYPE_PAGE_POOL) { + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + page_pool_destroy(xa->page_pool); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); + +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + xdp_unreg_mem_model(&xdp_rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + xdp_rxq_info_unreg_mem_model(xdp_rxq); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size) +{ + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; + xdp_rxq->frag_size = frag_size; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); + +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) +{ + return (xdp_rxq->reg_state == REG_STATE_REGISTERED); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. + */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + +static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, + void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (!__is_supported_mem_type(type)) + return ERR_PTR(-EOPNOTSUPP); + + mem->type = type; + + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ + return NULL; + } + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ERR_PTR(ret); + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + mem->id = id; + xdp_alloc->mem = *mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + ida_simple_remove(&mem_id_pool, mem->id); + mem->id = 0; + errno = PTR_ERR(ptr); + goto err; + } + + if (type == MEM_TYPE_PAGE_POOL) + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); + + mutex_unlock(&mem_id_lock); + + return xdp_alloc; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return ERR_PTR(errno); +} + +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + return 0; +} +EXPORT_SYMBOL_GPL(xdp_reg_mem_model); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + + if (trace_mem_connect_enabled() && xdp_alloc) + trace_mem_connect(xdp_alloc, xdp_rxq); + return 0; +} + +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolean + * is used for those calls sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp) +{ + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + page = virt_to_head_page(data); + if (napi_direct && xdp_return_frame_no_direct()) + napi_direct = false; + /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) + * as mem->type knows this a page_pool page + */ + page_pool_put_full_page(page->pp, page, napi_direct); + break; + case MEM_TYPE_PAGE_SHARED: + page_frag_free(data); + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ + put_page(page); + break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! */ + xsk_buff_free(xdp); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); + break; + } +} + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdpf->mem, false, NULL); + } +out: + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdpf->mem, true, NULL); + } +out: + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. + */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + + if (unlikely(!xa || !bq->count)) + return; + + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + xdp_return_frame(xdpf); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + if (unlikely(xdp_frame_has_frags(xdpf))) { + struct skb_shared_info *sinfo; + int i; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + + bq->q[bq->count++] = skb_frag_address(frag); + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + } + } + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); + } +out: + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); + +/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ +void __xdp_release_frame(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_release_page(xa->page_pool, page); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(__xdp_release_frame); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); + +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) +{ + unsigned int metasize, totsize; + void *addr, *data_to_copy; + struct xdp_frame *xdpf; + struct page *page; + + /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */ + metasize = xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; + totsize = xdp->data_end - xdp->data + metasize; + + if (sizeof(*xdpf) + totsize > PAGE_SIZE) + return NULL; + + page = dev_alloc_page(); + if (!page) + return NULL; + + addr = page_to_virt(page); + xdpf = addr; + memset(xdpf, 0, sizeof(*xdpf)); + + addr += sizeof(*xdpf); + data_to_copy = metasize ? xdp->data_meta : xdp->data; + memcpy(addr, data_to_copy, totsize); + + xdpf->data = addr + metasize; + xdpf->len = totsize - metasize; + xdpf->headroom = 0; + xdpf->metasize = metasize; + xdpf->frame_sz = PAGE_SIZE; + xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + + xsk_buff_free(xdp); + return xdpf; +} +EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); + +int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) +{ + n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, + n_skb, skbs); + if (unlikely(!n_skb)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); + unsigned int headroom, frame_size; + void *hard_start; + u8 nr_frags; + + /* xdp frags frame */ + if (unlikely(xdp_frame_has_frags(xdpf))) + nr_frags = sinfo->nr_frags; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. + */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + if (unlikely(xdp_frame_has_frags(xdpf))) + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_is_frag_pfmemalloc(xdpf)); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); + +struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct net_device *dev) +{ + struct sk_buff *skb; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + + return __xdp_build_skb_from_frame(xdpf, skb, dev); +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame); + +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) +{ + unsigned int headroom, totalsize; + struct xdp_frame *nxdpf; + struct page *page; + void *addr; + + headroom = xdpf->headroom + sizeof(*xdpf); + totalsize = headroom + xdpf->len; + + if (unlikely(totalsize > PAGE_SIZE)) + return NULL; + page = dev_alloc_page(); + if (!page) + return NULL; + addr = page_to_virt(page); + + memcpy(addr, xdpf, totalsize); + + nxdpf = addr; + nxdpf->data = addr + headroom; + nxdpf->frame_sz = PAGE_SIZE; + nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + nxdpf->mem.id = 0; + + return nxdpf; +} |