diff options
Diffstat (limited to '')
635 files changed, 23159 insertions, 14174 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index f001582345..9404dd551d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -478,6 +478,8 @@ static struct sk_buff *vlan_gro_receive(struct list_head *head, if (unlikely(!vhdr)) goto out; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen; + type = vhdr->h_vlan_encapsulated_proto; ptype = gro_find_receive_by_type(type); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 407b2335f0..3efba4f857 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -149,7 +149,7 @@ static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) if (max_mtu < new_mtu) return -ERANGE; - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } @@ -504,28 +504,6 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } -/* - * vlan network devices have devices nesting below it, and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key vlan_netdev_xmit_lock_key; -static struct lock_class_key vlan_netdev_addr_lock_key; - -static void vlan_dev_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *unused) -{ - lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); -} - -static void vlan_dev_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, - &vlan_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); -} - static __be16 vlan_parse_protocol(const struct sk_buff *skb) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); @@ -559,7 +537,7 @@ static const struct header_ops vlan_passthru_header_ops = { .parse_protocol = vlan_parse_protocol, }; -static struct device_type vlan_type = { +static const struct device_type vlan_type = { .name = "vlan", }; @@ -627,7 +605,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev); + netdev_lockdep_set_classes(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) @@ -784,9 +762,9 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) static int vlan_dev_get_iflink(const struct net_device *dev) { - struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + const struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - return real_dev->ifindex; + return READ_ONCE(real_dev->ifindex); } static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index a3b68243fd..cf5219df79 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -117,17 +117,15 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { - nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { - if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) - continue; + nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, + data[IFLA_VLAN_INGRESS_QOS], rem) { m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { - nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { - if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) - continue; + nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, + data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 7825c12974..87b959da00 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -163,48 +163,34 @@ void vlan_proc_rem_dev(struct net_device *vlandev) * The following few functions build the content of /proc/net/vlan/config */ -/* start read of /proc/net/vlan/config */ -static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(rcu) +static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos) { + unsigned long ifindex = *pos; struct net_device *dev; - struct net *net = seq_file_net(seq); - loff_t i = 1; - - rcu_read_lock(); - if (*pos == 0) - return SEQ_START_TOKEN; - for_each_netdev_rcu(net, dev) { + for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { if (!is_vlan_dev(dev)) continue; - - if (i++ == *pos) - return dev; + *pos = dev->ifindex; + return dev; } + return NULL; +} + +static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + if (*pos == 0) + return SEQ_START_TOKEN; - return NULL; + return vlan_seq_from_index(seq, pos); } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev; - struct net *net = seq_file_net(seq); - ++*pos; - - dev = v; - if (v == SEQ_START_TOKEN) - dev = net_device_entry(&net->dev_base_head); - - for_each_netdev_continue_rcu(net, dev) { - if (!is_vlan_dev(dev)) - continue; - - return dev; - } - - return NULL; + return vlan_seq_from_index(seq, pos); } static void vlan_seq_stop(struct seq_file *seq, void *v) diff --git a/net/9p/Kconfig b/net/9p/Kconfig index 00ebce9e5a..bcdab9c23b 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -5,6 +5,7 @@ menuconfig NET_9P tristate "Plan 9 Resource Sharing Support (9P2000)" + select NETFS_SUPPORT help If you say Y here, you will get experimental support for Plan 9 resource sharing via the 9P2000 protocol. diff --git a/net/9p/client.c b/net/9p/client.c index f7e90b4769..5cd94721d9 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -18,6 +18,7 @@ #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/uio.h> +#include <linux/netfs.h> #include <net/9p/9p.h> #include <linux/parser.h> #include <linux/seq_file.h> @@ -235,6 +236,8 @@ static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, if (!fc->sdata) return -ENOMEM; fc->capacity = alloc_msize; + fc->id = 0; + fc->tag = P9_NOTAG; return 0; } @@ -1661,6 +1664,54 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) } EXPORT_SYMBOL(p9_client_write); +void +p9_client_write_subreq(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct p9_fid *fid = wreq->netfs_priv; + struct p9_client *clnt = fid->clnt; + struct p9_req_t *req; + unsigned long long start = subreq->start + subreq->transferred; + int written, len = subreq->len - subreq->transferred; + int err; + + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu len %d\n", + fid->fid, start, len); + + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && len > 1024) { + req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &subreq->io_iter, + 0, wreq->len, P9_ZC_HDR_SZ, "dqd", + fid->fid, start, len); + } else { + req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, + start, len, &subreq->io_iter); + } + if (IS_ERR(req)) { + netfs_write_subrequest_terminated(subreq, PTR_ERR(req), false); + return; + } + + err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written); + if (err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_req_put(clnt, req); + netfs_write_subrequest_terminated(subreq, err, false); + return; + } + + if (written > len) { + pr_err("bogus RWRITE count (%d > %u)\n", written, len); + written = len; + } + + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len); + + p9_req_put(clnt, req); + netfs_write_subrequest_terminated(subreq, written, false); +} +EXPORT_SYMBOL(p9_client_write_subreq); + struct p9_wstat *p9_client_stat(struct p9_fid *fid) { int err; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 1a3948b8c4..196060dc61 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -95,7 +95,6 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @rreq: read request * @wreq: write request - * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header * @rc: temporary fcall for reading current frame * @wpos: write position for current frame diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index e305071eb7..0b8086f58a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -781,7 +781,6 @@ static struct virtio_driver p9_virtio_drv = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = p9_virtio_probe, .remove = p9_virtio_remove, diff --git a/net/Kconfig b/net/Kconfig index 4adc47d0c9..f0a8692496 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -60,6 +60,9 @@ config NET_XGRESS config NET_REDIRECT bool +config SKB_DECRYPTED + bool + config SKB_EXTENSIONS bool @@ -331,6 +334,7 @@ config NET_RX_BUSY_POLL config BQL bool + prompt "Enable Byte Queue Limits" depends on SYSFS select DQL default y @@ -448,6 +452,9 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_IEEE8021Q_HELPERS + bool + config NET_SELFTESTS def_tristate PHYLIB depends on PHYLIB && INET diff --git a/net/Makefile b/net/Makefile index b06b5539e7..65bb8c72a3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ -obj-$(CONFIG_UNIX_SCM) += unix/ +obj-$(CONFIG_UNIX) += unix/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 198f5ba2fe..b068651984 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -88,6 +88,7 @@ static inline void atalk_remove_socket(struct sock *sk) static struct sock *atalk_search_socket(struct sockaddr_at *to, struct atalk_iface *atif) { + struct sock *def_socket = NULL; struct sock *s; read_lock_bh(&atalk_sockets_lock); @@ -98,8 +99,20 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to, continue; if (to->sat_addr.s_net == ATADDR_ANYNET && - to->sat_addr.s_node == ATADDR_BCAST) - goto found; + to->sat_addr.s_node == ATADDR_BCAST) { + if (atif->address.s_node == at->src_node && + atif->address.s_net == at->src_net) { + /* This socket's address matches the address of the interface + * that received the packet -- use it + */ + goto found; + } + + /* Continue searching for a socket matching the interface address, + * but use this socket by default if no other one is found + */ + def_socket = s; + } if (to->sat_addr.s_net == at->src_net && (to->sat_addr.s_node == at->src_node || @@ -116,7 +129,7 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to, goto found; } } - s = NULL; + s = def_socket; found: read_unlock_bh(&atalk_sockets_lock); return s; diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c index d945b7c017..7aebfe9032 100644 --- a/net/appletalk/sysctl_net_atalk.c +++ b/net/appletalk/sysctl_net_atalk.c @@ -40,7 +40,6 @@ static struct ctl_table atalk_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, }; static struct ctl_table_header *atalk_table_header; diff --git a/net/atm/clip.c b/net/atm/clip.c index 294cb9efe3..42b910cb4e 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -345,7 +345,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, dev->stats.tx_dropped++; return NETDEV_TX_OK; } - rt = (struct rtable *) dst; + rt = dst_rtable(dst); if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else @@ -463,7 +463,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) unlink_clip_vcc(clip_vcc); return 0; } - rt = ip_route_output(&init_net, ip, 0, 1, 0); + rt = ip_route_output(&init_net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return PTR_ERR(rt); neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); diff --git a/net/atm/common.c b/net/atm/common.c index 2a1ec014e9..9b75699992 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); diff --git a/net/atm/svc.c b/net/atm/svc.c index 36a814f1fb..f8137ae693 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -324,8 +324,8 @@ out: return error; } -static int svc_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int svc_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; @@ -336,7 +336,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, lock_sock(sk); - error = svc_create(sock_net(sk), newsock, 0, kern); + error = svc_create(sock_net(sk), newsock, 0, arg->kern); if (error) goto out; @@ -355,7 +355,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags, error = -sk->sk_err; break; } - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { error = -EAGAIN; break; } diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index fdb666607f..e23a3dc14b 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -4,7 +4,7 @@ # menuconfig HAMRADIO - depends on NET && !S390 + depends on NET bool "Amateur Radio support" help If you want to connect your Linux box to an amateur radio, answer Y diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 9169efb2f4..d6f9fae06a 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1373,13 +1373,15 @@ out_release: return err; } -static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int ax25_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; + ax25_dev *ax25_dev; DEFINE_WAIT(wait); struct sock *sk; + ax25_cb *ax25; int err = 0; if (sock->state != SS_UNCONNECTED) @@ -1409,7 +1411,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, if (skb) break; - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } @@ -1434,6 +1436,10 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; + ax25 = sk_to_ax25(newsk); + ax25_dev = ax25->ax25_dev; + netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC); + ax25_dev_hold(ax25_dev); out: release_sock(sk); diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 282ec581c0..9efd6690b3 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -22,11 +22,12 @@ #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> +#include <linux/list.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/init.h> -ax25_dev *ax25_dev_list; +static LIST_HEAD(ax25_dev_list); DEFINE_SPINLOCK(ax25_dev_lock); ax25_dev *ax25_addr_ax25dev(ax25_address *addr) @@ -34,10 +35,11 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) ax25_dev *ax25_dev, *res = NULL; spin_lock_bh(&ax25_dev_lock); - for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) + list_for_each_entry(ax25_dev, &ax25_dev_list, list) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; ax25_dev_hold(ax25_dev); + break; } spin_unlock_bh(&ax25_dev_lock); @@ -59,7 +61,6 @@ void ax25_dev_device_up(struct net_device *dev) } refcount_set(&ax25_dev->refcount, 1); - dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL); ax25_dev->forward = NULL; @@ -78,17 +79,19 @@ void ax25_dev_device_up(struct net_device *dev) ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; + +#ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; +#endif #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); #endif spin_lock_bh(&ax25_dev_lock); - ax25_dev->next = ax25_dev_list; - ax25_dev_list = ax25_dev; + list_add(&ax25_dev->list, &ax25_dev_list); + dev->ax25_ptr = ax25_dev; spin_unlock_bh(&ax25_dev_lock); - ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -111,32 +114,19 @@ void ax25_dev_device_down(struct net_device *dev) /* * Remove any packet forwarding that points to this device. */ - for (s = ax25_dev_list; s != NULL; s = s->next) + list_for_each_entry(s, &ax25_dev_list, list) if (s->forward == dev) s->forward = NULL; - if ((s = ax25_dev_list) == ax25_dev) { - ax25_dev_list = s->next; - goto unlock_put; - } - - while (s != NULL && s->next != NULL) { - if (s->next == ax25_dev) { - s->next = ax25_dev->next; - goto unlock_put; + list_for_each_entry(s, &ax25_dev_list, list) { + if (s == ax25_dev) { + list_del(&s->list); + break; } - - s = s->next; } - spin_unlock_bh(&ax25_dev_lock); - dev->ax25_ptr = NULL; - ax25_dev_put(ax25_dev); - return; -unlock_put: - spin_unlock_bh(&ax25_dev_lock); - ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; + spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); } @@ -200,16 +190,13 @@ struct net_device *ax25_fwd_dev(struct net_device *dev) */ void __exit ax25_dev_free(void) { - ax25_dev *s, *ax25_dev; + ax25_dev *s, *n; spin_lock_bh(&ax25_dev_lock); - ax25_dev = ax25_dev_list; - while (ax25_dev != NULL) { - s = ax25_dev; - netdev_put(ax25_dev->dev, &ax25_dev->dev_tracker); - ax25_dev = ax25_dev->next; - kfree(s); + list_for_each_entry_safe(s, n, &ax25_dev_list, list) { + netdev_put(s->dev, &s->dev_tracker); + list_del(&s->list); + ax25_dev_put(s); } - ax25_dev_list = NULL; spin_unlock_bh(&ax25_dev_lock); } diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index db66e11e7f..68753aa303 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -141,8 +141,6 @@ static const struct ctl_table ax25_param_table[] = { .extra2 = &max_ds_timeout }, #endif - - { } /* that's all, folks! */ }; int ax25_register_dev_sysctl(ax25_dev *ax25_dev) @@ -155,6 +153,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) if (!table) return -ENOMEM; + BUILD_BUG_ON(ARRAY_SIZE(ax25_param_table) != AX25_MAX_VALUES); for (k = 0; k < AX25_MAX_VALUES; k++) table[k].data = &ax25_dev->values[k]; @@ -171,7 +170,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) { struct ctl_table_header *header = ax25_dev->sysheader; - struct ctl_table *table; + const struct ctl_table *table; if (header) { ax25_dev->sysheader = NULL; diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 28a939d560..4c7e855343 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -684,7 +684,7 @@ static bool batadv_dat_forward_data(struct batadv_priv *bat_priv, cand = batadv_dat_select_candidates(bat_priv, ip, vid); if (!cand) - goto out; + return ret; batadv_dbg(BATADV_DBG_DAT, bat_priv, "DHT_SEND for %pI4\n", &ip); @@ -728,7 +728,6 @@ free_orig: batadv_orig_node_put(cand[i].orig_node); } -out: kfree(cand); return ret; } diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 5fc754b0b3..8e0f44c716 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -14,7 +14,6 @@ #include <linux/crc32c.h> #include <linux/device.h> #include <linux/errno.h> -#include <linux/genetlink.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> @@ -38,6 +37,7 @@ #include <linux/string.h> #include <linux/workqueue.h> #include <net/dsfield.h> +#include <net/genetlink.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -691,29 +691,31 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, "%s%s", BATADV_UEV_TYPE_VAR, batadv_uev_type_str[type]); if (!uevent_env[0]) - goto out; + goto report_error; uevent_env[1] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_ACTION_VAR, batadv_uev_action_str[action]); if (!uevent_env[1]) - goto out; + goto free_first_env; /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { uevent_env[2] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) - goto out; + goto free_second_env; } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); -out: - kfree(uevent_env[0]); - kfree(uevent_env[1]); kfree(uevent_env[2]); +free_second_env: + kfree(uevent_env[1]); +free_first_env: + kfree(uevent_env[0]); if (ret) +report_error: batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", batadv_uev_type_str[type], diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 870dcd7f17..3d4c36ae2e 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2024.0" +#define BATADV_SOURCE_VERSION "2024.2" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 1f7ed9d4f6..9362cd9d6f 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -15,8 +15,6 @@ #include <linux/cache.h> #include <linux/err.h> #include <linux/errno.h> -#include <linux/export.h> -#include <linux/genetlink.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 71c143d4b6..8f6dd2c6ee 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -12,6 +12,7 @@ #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> +#include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> @@ -132,6 +133,29 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, } /** + * batadv_vlan_id_valid() - check if vlan id is in valid batman-adv encoding + * @vid: the VLAN identifier + * + * Return: true when either no vlan is set or if VLAN is in correct range, + * false otherwise + */ +static bool batadv_vlan_id_valid(unsigned short vid) +{ + unsigned short non_vlan = vid & ~(BATADV_VLAN_HAS_TAG | VLAN_VID_MASK); + + if (vid == 0) + return true; + + if (!(vid & BATADV_VLAN_HAS_TAG)) + return false; + + if (non_vlan) + return false; + + return true; +} + +/** * batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan * object * @orig_node: the originator serving the VLAN @@ -149,6 +173,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, { struct batadv_orig_node_vlan *vlan; + if (!batadv_vlan_id_valid(vid)) + return NULL; + spin_lock_bh(&orig_node->vlan_list_lock); /* first look if an object for this vid already exists */ @@ -1266,6 +1293,8 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) /* for all origins... */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; + if (hlist_empty(head)) + continue; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 89c51b3cf4..30ecbc2ef1 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -159,7 +159,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); bat_priv->mtu_set_by_user = new_mtu; return 0; diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 5dd52bc5ca..6b816cf1a9 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -40,8 +40,8 @@ TRACE_EVENT(batadv_dbg, ), TP_fast_assign( - __assign_str(device, bat_priv->soft_iface->name); - __assign_str(driver, KBUILD_MODNAME); + __assign_str(device); + __assign_str(driver); __assign_vstr(msg, vaf->fmt, vaf->va); ), diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 715cbafbf6..50cfec8cca 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -133,7 +133,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; @@ -572,7 +572,7 @@ static void netdev_setup(struct net_device *dev) dev->needs_free_netdev = true; } -static struct device_type bt_type = { +static const struct device_type bt_type = { .name = "bluetooth", }; diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 5a6a49885a..ec45f77fce 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -385,7 +385,8 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) case BNEP_COMPRESSED_DST_ONLY: __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN); - __skb_put_data(nskb, s->eh.h_source, ETH_ALEN + 2); + __skb_put_data(nskb, s->eh.h_source, ETH_ALEN); + put_unaligned(s->eh.h_proto, (__be16 *)__skb_put(nskb, 2)); break; case BNEP_GENERAL: @@ -549,7 +550,7 @@ static struct device *bnep_get_device(struct bnep_session *session) return &conn->hcon->dev; } -static struct device_type bnep_type = { +static const struct device_type bnep_type = { .name = "bluetooth", }; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 18f97b2288..080053a85b 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1,7 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. - Copyright 2023 NXP + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> @@ -241,13 +241,13 @@ static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec) __u8 vnd_len, *vnd_data = NULL; struct hci_op_configure_data_path *cmd = NULL; + /* Do not take below 2 checks as error since the 1st means user do not + * want to use HFP offload mode and the 2nd means the vendor controller + * do not need to send below HCI command for offload mode. + */ if (!codec->data_path || !hdev->get_codec_config_data) return 0; - /* Do not take me as error */ - if (!hdev->get_codec_config_data) - return 0; - err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, &vnd_data); if (err < 0) @@ -664,11 +664,6 @@ static void le_conn_timeout(struct work_struct *work) hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); } -struct iso_cig_params { - struct hci_cp_le_set_cig_params cp; - struct hci_cis_params cis[0x1f]; -}; - struct iso_list_data { union { u8 cig; @@ -904,16 +899,42 @@ static int hci_conn_hash_alloc_unset(struct hci_dev *hdev) U16_MAX, GFP_ATOMIC); } -struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role, u16 handle) +static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, + u8 role, u16 handle) { struct hci_conn *conn; + switch (type) { + case ACL_LINK: + if (!hdev->acl_mtu) + return ERR_PTR(-ECONNREFUSED); + break; + case ISO_LINK: + if (hdev->iso_mtu) + /* Dedicated ISO Buffer exists */ + break; + fallthrough; + case LE_LINK: + if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) + return ERR_PTR(-ECONNREFUSED); + if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU) + return ERR_PTR(-ECONNREFUSED); + break; + case SCO_LINK: + case ESCO_LINK: + if (!hdev->sco_pkts) + /* Controller does not support SCO or eSCO over HCI */ + return ERR_PTR(-ECONNREFUSED); + break; + default: + return ERR_PTR(-ECONNREFUSED); + } + bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle); conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) - return NULL; + return ERR_PTR(-ENOMEM); bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); @@ -944,10 +965,12 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; + conn->mtu = hdev->acl_mtu; break; case LE_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); + conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case ISO_LINK: /* conn->src should reflect the local identity address */ @@ -959,6 +982,8 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, else if (conn->role == HCI_ROLE_MASTER) conn->cleanup = cis_cleanup; + conn->mtu = hdev->iso_mtu ? hdev->iso_mtu : + hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case SCO_LINK: if (lmp_esco_capable(hdev)) @@ -966,9 +991,12 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, (hdev->esco_type & EDR_ESCO_MASK); else conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; + + conn->mtu = hdev->sco_mtu; break; case ESCO_LINK: conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK; + conn->mtu = hdev->sco_mtu; break; } @@ -1011,9 +1039,18 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, handle = hci_conn_hash_alloc_unset(hdev); if (unlikely(handle < 0)) - return NULL; + return ERR_PTR(-ECONNREFUSED); - return hci_conn_add(hdev, type, dst, role, handle); + return __hci_conn_add(hdev, type, dst, role, handle); +} + +struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, + u8 role, u16 handle) +{ + if (handle > HCI_CONN_HANDLE_MAX) + return ERR_PTR(-EINVAL); + + return __hci_conn_add(hdev, type, dst, role, handle); } static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) @@ -1140,8 +1177,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || - hci_dev_test_flag(d, HCI_USER_CHANNEL) || - d->dev_type != HCI_PRIMARY) + hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Simple routing: @@ -1317,8 +1353,8 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, bacpy(&conn->dst, dst); } else { conn = hci_conn_add_unset(hdev, LE_LINK, dst, role); - if (!conn) - return ERR_PTR(-ENOMEM); + if (IS_ERR(conn)) + return conn; hci_conn_hold(conn); conn->pending_sec_level = sec_level; } @@ -1494,8 +1530,8 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EADDRINUSE); conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); - if (!conn) - return ERR_PTR(-ENOMEM); + if (IS_ERR(conn)) + return conn; conn->state = BT_CONNECT; @@ -1538,8 +1574,8 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, BT_DBG("requesting refresh of dst_addr"); conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER); - if (!conn) - return ERR_PTR(-ENOMEM); + if (IS_ERR(conn)) + return conn; if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) { hci_conn_del(conn); @@ -1586,8 +1622,8 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); - if (!acl) - return ERR_PTR(-ENOMEM); + if (IS_ERR(acl)) + return acl; } hci_conn_hold(acl); @@ -1655,9 +1691,9 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, sco = hci_conn_hash_lookup_ba(hdev, type, dst); if (!sco) { sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER); - if (!sco) { + if (IS_ERR(sco)) { hci_conn_drop(acl); - return ERR_PTR(-ENOMEM); + return sco; } } @@ -1722,34 +1758,33 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) static int set_cig_params_sync(struct hci_dev *hdev, void *data) { + DEFINE_FLEX(struct hci_cp_le_set_cig_params, pdu, cis, num_cis, 0x1f); u8 cig_id = PTR_UINT(data); struct hci_conn *conn; struct bt_iso_qos *qos; - struct iso_cig_params pdu; + u8 aux_num_cis = 0; u8 cis_id; conn = hci_conn_hash_lookup_cig(hdev, cig_id); if (!conn) return 0; - memset(&pdu, 0, sizeof(pdu)); - qos = &conn->iso_qos; - pdu.cp.cig_id = cig_id; - hci_cpu_to_le24(qos->ucast.out.interval, pdu.cp.c_interval); - hci_cpu_to_le24(qos->ucast.in.interval, pdu.cp.p_interval); - pdu.cp.sca = qos->ucast.sca; - pdu.cp.packing = qos->ucast.packing; - pdu.cp.framing = qos->ucast.framing; - pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency); - pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency); + pdu->cig_id = cig_id; + hci_cpu_to_le24(qos->ucast.out.interval, pdu->c_interval); + hci_cpu_to_le24(qos->ucast.in.interval, pdu->p_interval); + pdu->sca = qos->ucast.sca; + pdu->packing = qos->ucast.packing; + pdu->framing = qos->ucast.framing; + pdu->c_latency = cpu_to_le16(qos->ucast.out.latency); + pdu->p_latency = cpu_to_le16(qos->ucast.in.latency); /* Reprogram all CIS(s) with the same CIG, valid range are: * num_cis: 0x00 to 0x1F * cis_id: 0x00 to 0xEF */ for (cis_id = 0x00; cis_id < 0xf0 && - pdu.cp.num_cis < ARRAY_SIZE(pdu.cis); cis_id++) { + aux_num_cis < pdu->num_cis; cis_id++) { struct hci_cis_params *cis; conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id); @@ -1758,7 +1793,7 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data) qos = &conn->iso_qos; - cis = &pdu.cis[pdu.cp.num_cis++]; + cis = &pdu->cis[aux_num_cis++]; cis->cis_id = cis_id; cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); @@ -1769,14 +1804,14 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data) cis->c_rtn = qos->ucast.out.rtn; cis->p_rtn = qos->ucast.in.rtn; } + pdu->num_cis = aux_num_cis; - if (!pdu.cp.num_cis) + if (!pdu->num_cis) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, - sizeof(pdu.cp) + - pdu.cp.num_cis * sizeof(pdu.cis[0]), &pdu, - HCI_CMD_TIMEOUT); + struct_size(pdu, cis, pdu->num_cis), + pdu, HCI_CMD_TIMEOUT); } static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) @@ -1847,8 +1882,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, qos->ucast.cis); if (!cis) { cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); - if (!cis) - return ERR_PTR(-ENOMEM); + if (IS_ERR(cis)) + return cis; cis->cleanup = cis_cleanup; cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; @@ -1983,14 +2018,8 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, struct bt_iso_io_qos *qos, __u8 phy) { /* Only set MTU if PHY is enabled */ - if (!qos->sdu && qos->phy) { - if (hdev->iso_mtu > 0) - qos->sdu = hdev->iso_mtu; - else if (hdev->le_mtu > 0) - qos->sdu = hdev->le_mtu; - else - qos->sdu = hdev->acl_mtu; - } + if (!qos->sdu && qos->phy) + qos->sdu = conn->mtu; /* Use the same PHY as ACL if set to any */ if (qos->phy == BT_ISO_PHY_ANY) @@ -2059,18 +2088,31 @@ static int create_pa_sync(struct hci_dev *hdev, void *data) return hci_update_passive_scan_sync(hdev); } -int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, - __u8 sid, struct bt_iso_qos *qos) +struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos) { struct hci_cp_le_pa_create_sync *cp; + struct hci_conn *conn; + int err; if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) - return -EBUSY; + return ERR_PTR(-EBUSY); + + conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); + if (IS_ERR(conn)) + return conn; + + conn->iso_qos = *qos; + conn->state = BT_LISTEN; + + hci_conn_hold(conn); cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) { hci_dev_clear_flag(hdev, HCI_PA_SYNC); - return -ENOMEM; + hci_conn_drop(conn); + return ERR_PTR(-ENOMEM); } cp->options = qos->bcast.options; @@ -2082,20 +2124,24 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, cp->sync_cte_type = qos->bcast.sync_cte_type; /* Queue start pa_create_sync and scan */ - return hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete); + err = hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete); + if (err < 0) { + hci_conn_drop(conn); + kfree(cp); + return ERR_PTR(err); + } + + return conn; } int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) { - struct _packed { - struct hci_cp_le_big_create_sync cp; - __u8 bis[0x11]; - } pdu; + DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); int err; - if (num_bis < 0x01 || num_bis > sizeof(pdu.bis)) + if (num_bis < 0x01 || num_bis > pdu->num_bis) return -EINVAL; err = qos_set_big(hdev, qos); @@ -2105,18 +2151,17 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, if (hcon) hcon->iso_qos.bcast.big = qos->bcast.big; - memset(&pdu, 0, sizeof(pdu)); - pdu.cp.handle = qos->bcast.big; - pdu.cp.sync_handle = cpu_to_le16(sync_handle); - pdu.cp.encryption = qos->bcast.encryption; - memcpy(pdu.cp.bcode, qos->bcast.bcode, sizeof(pdu.cp.bcode)); - pdu.cp.mse = qos->bcast.mse; - pdu.cp.timeout = cpu_to_le16(qos->bcast.timeout); - pdu.cp.num_bis = num_bis; - memcpy(pdu.bis, bis, num_bis); + pdu->handle = qos->bcast.big; + pdu->sync_handle = cpu_to_le16(sync_handle); + pdu->encryption = qos->bcast.encryption; + memcpy(pdu->bcode, qos->bcast.bcode, sizeof(pdu->bcode)); + pdu->mse = qos->bcast.mse; + pdu->timeout = cpu_to_le16(qos->bcast.timeout); + pdu->num_bis = num_bis; + memcpy(pdu->bis, bis, num_bis); return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, - sizeof(pdu.cp) + num_bis, &pdu); + struct_size(pdu, bis, num_bis), pdu); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0592369579..6ecb110bf4 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -63,50 +63,6 @@ DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); -static int hci_scan_req(struct hci_request *req, unsigned long opt) -{ - __u8 scan = opt; - - BT_DBG("%s %x", req->hdev->name, scan); - - /* Inquiry and Page scans */ - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); - return 0; -} - -static int hci_auth_req(struct hci_request *req, unsigned long opt) -{ - __u8 auth = opt; - - BT_DBG("%s %x", req->hdev->name, auth); - - /* Authentication */ - hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); - return 0; -} - -static int hci_encrypt_req(struct hci_request *req, unsigned long opt) -{ - __u8 encrypt = opt; - - BT_DBG("%s %x", req->hdev->name, encrypt); - - /* Encryption */ - hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); - return 0; -} - -static int hci_linkpol_req(struct hci_request *req, unsigned long opt) -{ - __le16 policy = cpu_to_le16(opt); - - BT_DBG("%s %x", req->hdev->name, policy); - - /* Default link policy */ - hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); - return 0; -} - /* Get HCI device by index. * Device is held on return. */ struct hci_dev *hci_dev_get(int index) @@ -149,8 +105,6 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state) { int old_state = hdev->discovery.state; - BT_DBG("%s state %u -> %u", hdev->name, hdev->discovery.state, state); - if (old_state == state) return; @@ -173,6 +127,8 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state) case DISCOVERY_STOPPING: break; } + + bt_dev_dbg(hdev, "state %u -> %u", old_state, state); } void hci_inquiry_cache_flush(struct hci_dev *hdev) @@ -395,11 +351,6 @@ int hci_inquiry(void __user *arg) goto done; } - if (hdev->dev_type != HCI_PRIMARY) { - err = -EOPNOTSUPP; - goto done; - } - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; @@ -733,6 +684,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; + __le16 policy; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) @@ -752,11 +704,6 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) goto done; } - if (hdev->dev_type != HCI_PRIMARY) { - err = -EOPNOTSUPP; - goto done; - } - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; @@ -764,8 +711,8 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) switch (cmd) { case HCISETAUTH: - err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, + 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETENCRYPT: @@ -776,19 +723,21 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ - err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = hci_cmd_sync_status(hdev, + HCI_OP_WRITE_AUTH_ENABLE, + 1, &dr.dev_opt, + HCI_CMD_TIMEOUT); if (err) break; } - err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE, + 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETSCAN: - err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, + 1, &dr.dev_opt, HCI_CMD_TIMEOUT); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. @@ -798,8 +747,10 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) break; case HCISETLINKPOL: - err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + policy = cpu_to_le16(dr.dev_opt); + + err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, + 2, &policy, HCI_CMD_TIMEOUT); break; case HCISETLINKMODE: @@ -910,7 +861,7 @@ int hci_get_dev_info(void __user *arg) strscpy(di.name, hdev->name, sizeof(di.name)); di.bdaddr = hdev->bdaddr; - di.type = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4); + di.type = (hdev->bus & 0x0f); di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { @@ -940,20 +891,51 @@ int hci_get_dev_info(void __user *arg) /* ---- Interface to HCI drivers ---- */ +static int hci_dev_do_poweroff(struct hci_dev *hdev) +{ + int err; + + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_sync_lock(hdev); + + err = hci_set_powered_sync(hdev, false); + + hci_req_sync_unlock(hdev); + + return err; +} + static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; + int err; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; + if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED)) + return 0; + if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); + if (!hci_dev_test_flag(hdev, HCI_SETUP) && - !hci_dev_test_flag(hdev, HCI_CONFIG)) - hci_dev_do_close(hdev); + !hci_dev_test_flag(hdev, HCI_CONFIG)) { + err = hci_dev_do_poweroff(hdev); + if (err) { + bt_dev_err(hdev, "Error when powering off device on rfkill (%d)", + err); + + /* Make sure the device is still closed even if + * anything during power off sequence (eg. + * disconnecting devices) failed. + */ + hci_dev_do_close(hdev); + } + } } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } @@ -995,8 +977,7 @@ static void hci_power_on(struct work_struct *work) */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || - (hdev->dev_type == HCI_PRIMARY && - !bacmp(&hdev->bdaddr, BDADDR_ANY) && + (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); @@ -1738,6 +1719,15 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, adv->pending = true; adv->instance = instance; + + /* If controller support only one set and the instance is set to + * 1 then there is no option other than using handle 0x00. + */ + if (hdev->le_num_of_adv_sets == 1 && instance == 1) + adv->handle = 0x00; + else + adv->handle = instance; + list_add(&adv->list, &hdev->adv_instances); hdev->adv_instance_cnt++; } @@ -2492,16 +2482,16 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) hdev->le_adv_channel_map = 0x07; hdev->le_adv_min_interval = 0x0800; hdev->le_adv_max_interval = 0x0800; - hdev->le_scan_interval = 0x0060; - hdev->le_scan_window = 0x0030; - hdev->le_scan_int_suspend = 0x0400; - hdev->le_scan_window_suspend = 0x0012; + hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST; + hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST; + hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1; + hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; - hdev->le_scan_int_adv_monitor = 0x0060; - hdev->le_scan_window_adv_monitor = 0x0030; - hdev->le_scan_int_connect = 0x0060; - hdev->le_scan_window_connect = 0x0060; + hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST; + hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST; + hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN; + hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; @@ -2518,7 +2508,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; - hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT; + hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT; hdev->min_le_tx_power = HCI_TX_POWER_INVALID; hdev->max_le_tx_power = HCI_TX_POWER_INVALID; @@ -2604,20 +2594,7 @@ int hci_register_dev(struct hci_dev *hdev) if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; - /* Do not allow HCI_AMP devices to register at index 0, - * so the index can be used as the AMP controller ID. - */ - switch (hdev->dev_type) { - case HCI_PRIMARY: - id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); - break; - case HCI_AMP: - id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); - break; - default: - return -EINVAL; - } - + id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL); if (id < 0) return id; @@ -2669,12 +2646,10 @@ int hci_register_dev(struct hci_dev *hdev) hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); - if (hdev->dev_type == HCI_PRIMARY) { - /* Assume BR/EDR support until proven otherwise (such as - * through reading supported features during init. - */ - hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); - } + /* Assume BR/EDR support until proven otherwise (such as + * through reading supported features during init. + */ + hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); write_lock(&hci_dev_list_lock); list_add(&hdev->list, &hci_dev_list); @@ -2711,7 +2686,7 @@ err_wqueue: destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: - ida_simple_remove(&hci_index_ida, hdev->id); + ida_free(&hci_index_ida, hdev->id); return error; } @@ -2730,14 +2705,16 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + cancel_work_sync(&hdev->rx_work); + cancel_work_sync(&hdev->cmd_work); + cancel_work_sync(&hdev->tx_work); cancel_work_sync(&hdev->power_on); + cancel_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); - msft_unregister(hdev); - hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && @@ -2791,10 +2768,11 @@ void hci_release_dev(struct hci_dev *hdev) hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_codec_list_clear(&hdev->local_codecs); + msft_release(hdev); hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); - ida_simple_remove(&hci_index_ida, hdev->id); + ida_free(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->req_skb); kfree_skb(hdev->recv_event); @@ -3211,17 +3189,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; - switch (hdev->dev_type) { - case HCI_PRIMARY: - hci_add_acl_hdr(skb, conn->handle, flags); - break; - case HCI_AMP: - hci_add_acl_hdr(skb, chan->handle, flags); - break; - default: - bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); - return; - } + hci_add_acl_hdr(skb, conn->handle, flags); list = skb_shinfo(skb)->frag_list; if (!list) { @@ -3381,9 +3349,6 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case ACL_LINK: cnt = hdev->acl_cnt; break; - case AMP_LINK: - cnt = hdev->block_cnt; - break; case SCO_LINK: case ESCO_LINK: cnt = hdev->sco_cnt; @@ -3581,12 +3546,6 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) } -static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb) -{ - /* Calculate count of blocks used by this packet */ - return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len); -} - static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { unsigned long last_tx; @@ -3700,81 +3659,15 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hci_prio_recalculate(hdev, ACL_LINK); } -static void hci_sched_acl_blk(struct hci_dev *hdev) -{ - unsigned int cnt = hdev->block_cnt; - struct hci_chan *chan; - struct sk_buff *skb; - int quote; - u8 type; - - BT_DBG("%s", hdev->name); - - if (hdev->dev_type == HCI_AMP) - type = AMP_LINK; - else - type = ACL_LINK; - - __check_timeout(hdev, cnt, type); - - while (hdev->block_cnt > 0 && - (chan = hci_chan_sent(hdev, type, "e))) { - u32 priority = (skb_peek(&chan->data_q))->priority; - while (quote > 0 && (skb = skb_peek(&chan->data_q))) { - int blocks; - - BT_DBG("chan %p skb %p len %d priority %u", chan, skb, - skb->len, skb->priority); - - /* Stop if priority has changed */ - if (skb->priority < priority) - break; - - skb = skb_dequeue(&chan->data_q); - - blocks = __get_blocks(hdev, skb); - if (blocks > hdev->block_cnt) - return; - - hci_conn_enter_active_mode(chan->conn, - bt_cb(skb)->force_active); - - hci_send_frame(hdev, skb); - hdev->acl_last_tx = jiffies; - - hdev->block_cnt -= blocks; - quote -= blocks; - - chan->sent += blocks; - chan->conn->sent += blocks; - } - } - - if (cnt != hdev->block_cnt) - hci_prio_recalculate(hdev, type); -} - static void hci_sched_acl(struct hci_dev *hdev) { BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ - if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY) - return; - - /* No AMP link over AMP controller */ - if (!hci_conn_num(hdev, AMP_LINK) && hdev->dev_type == HCI_AMP) + if (!hci_conn_num(hdev, ACL_LINK)) return; - switch (hdev->flow_ctl_mode) { - case HCI_FLOW_CTL_MODE_PACKET_BASED: - hci_sched_acl_pkt(hdev); - break; - - case HCI_FLOW_CTL_MODE_BLOCK_BASED: - hci_sched_acl_blk(hdev); - break; - } + hci_sched_acl_pkt(hdev); } static void hci_sched_le(struct hci_dev *hdev) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 9d1063c51e..a78f6d706c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1,7 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. - Copyright 2023 NXP + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> @@ -913,21 +913,6 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data, return rp->status; } -static u8 hci_cc_read_flow_control_mode(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_rp_read_flow_control_mode *rp = data; - - bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); - - if (rp->status) - return rp->status; - - hdev->flow_ctl_mode = rp->mode; - - return rp->status; -} - static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -954,6 +939,9 @@ static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data, BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name, hdev->acl_mtu, hdev->acl_pkts, hdev->sco_mtu, hdev->sco_pkts); + if (!hdev->acl_mtu || !hdev->acl_pkts) + return HCI_ERROR_INVALID_PARAMETERS; + return rp->status; } @@ -1068,28 +1056,6 @@ static u8 hci_cc_write_page_scan_type(struct hci_dev *hdev, void *data, return rp->status; } -static u8 hci_cc_read_data_block_size(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_rp_read_data_block_size *rp = data; - - bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); - - if (rp->status) - return rp->status; - - hdev->block_mtu = __le16_to_cpu(rp->max_acl_len); - hdev->block_len = __le16_to_cpu(rp->block_len); - hdev->num_blocks = __le16_to_cpu(rp->num_blocks); - - hdev->block_cnt = hdev->num_blocks; - - BT_DBG("%s blk mtu %d cnt %d len %d", hdev->name, hdev->block_mtu, - hdev->block_cnt, hdev->block_len); - - return rp->status; -} - static u8 hci_cc_read_clock(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -1124,30 +1090,6 @@ unlock: return rp->status; } -static u8 hci_cc_read_local_amp_info(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_rp_read_local_amp_info *rp = data; - - bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); - - if (rp->status) - return rp->status; - - hdev->amp_status = rp->amp_status; - hdev->amp_total_bw = __le32_to_cpu(rp->total_bw); - hdev->amp_max_bw = __le32_to_cpu(rp->max_bw); - hdev->amp_min_latency = __le32_to_cpu(rp->min_latency); - hdev->amp_max_pdu = __le32_to_cpu(rp->max_pdu); - hdev->amp_type = rp->amp_type; - hdev->amp_pal_cap = __le16_to_cpu(rp->pal_cap); - hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size); - hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to); - hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to); - - return rp->status; -} - static u8 hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -1263,6 +1205,9 @@ static u8 hci_cc_le_read_buffer_size(struct hci_dev *hdev, void *data, BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts); + if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) + return HCI_ERROR_INVALID_PARAMETERS; + return rp->status; } @@ -1777,10 +1722,10 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) switch (enable) { case LE_SCAN_ENABLE: hci_dev_set_flag(hdev, HCI_LE_SCAN); - if (hdev->le_scan_type == LE_SCAN_ACTIVE) + if (hdev->le_scan_type == LE_SCAN_ACTIVE) { clear_pending_adv_report(hdev); - if (hci_dev_test_flag(hdev, HCI_MESH)) hci_discovery_set_state(hdev, DISCOVERY_FINDING); + } break; case LE_SCAN_DISABLE: @@ -2342,8 +2287,8 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) if (!conn) { conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr, HCI_ROLE_MASTER); - if (!conn) - bt_dev_err(hdev, "no memory for new connection"); + if (IS_ERR(conn)) + bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); } } @@ -3154,8 +3099,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, BDADDR_BREDR)) { conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, HCI_ROLE_SLAVE); - if (!conn) { - bt_dev_err(hdev, "no memory for new conn"); + if (IS_ERR(conn)) { + bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; } } else { @@ -3343,8 +3288,8 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, if (!conn) { conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, HCI_ROLE_SLAVE); - if (!conn) { - bt_dev_err(hdev, "no memory for new connection"); + if (IS_ERR(conn)) { + bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; } } @@ -3821,6 +3766,9 @@ static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data, BT_DBG("%s acl mtu %d:%d iso mtu %d:%d", hdev->name, hdev->acl_mtu, hdev->acl_pkts, hdev->iso_mtu, hdev->iso_pkts); + if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) + return HCI_ERROR_INVALID_PARAMETERS; + return rp->status; } @@ -4112,12 +4060,6 @@ static const struct hci_cc { HCI_CC(HCI_OP_READ_PAGE_SCAN_TYPE, hci_cc_read_page_scan_type, sizeof(struct hci_rp_read_page_scan_type)), HCI_CC_STATUS(HCI_OP_WRITE_PAGE_SCAN_TYPE, hci_cc_write_page_scan_type), - HCI_CC(HCI_OP_READ_DATA_BLOCK_SIZE, hci_cc_read_data_block_size, - sizeof(struct hci_rp_read_data_block_size)), - HCI_CC(HCI_OP_READ_FLOW_CONTROL_MODE, hci_cc_read_flow_control_mode, - sizeof(struct hci_rp_read_flow_control_mode)), - HCI_CC(HCI_OP_READ_LOCAL_AMP_INFO, hci_cc_read_local_amp_info, - sizeof(struct hci_rp_read_local_amp_info)), HCI_CC(HCI_OP_READ_CLOCK, hci_cc_read_clock, sizeof(struct hci_rp_read_clock)), HCI_CC(HCI_OP_READ_ENC_KEY_SIZE, hci_cc_read_enc_key_size, @@ -4308,7 +4250,7 @@ static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); /* Remove connection if command failed */ - for (i = 0; cp->num_cis; cp->num_cis--, i++) { + for (i = 0; i < cp->num_cis; i++) { struct hci_conn *conn; u16 handle; @@ -4324,6 +4266,7 @@ static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status) hci_conn_del(conn); } } + cp->num_cis = 0; if (pending) hci_le_create_cis_pending(hdev); @@ -4452,11 +4395,6 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, flex_array_size(ev, handles, ev->num))) return; - if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_PACKET_BASED) { - bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode); - return; - } - bt_dev_dbg(hdev, "num %d", ev->num); for (i = 0; i < ev->num; i++) { @@ -4524,78 +4462,6 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, queue_work(hdev->workqueue, &hdev->tx_work); } -static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev, - __u16 handle) -{ - struct hci_chan *chan; - - switch (hdev->dev_type) { - case HCI_PRIMARY: - return hci_conn_hash_lookup_handle(hdev, handle); - case HCI_AMP: - chan = hci_chan_lookup_handle(hdev, handle); - if (chan) - return chan->conn; - break; - default: - bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); - break; - } - - return NULL; -} - -static void hci_num_comp_blocks_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_num_comp_blocks *ev = data; - int i; - - if (!hci_ev_skb_pull(hdev, skb, HCI_EV_NUM_COMP_BLOCKS, - flex_array_size(ev, handles, ev->num_hndl))) - return; - - if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_BLOCK_BASED) { - bt_dev_err(hdev, "wrong event for mode %d", - hdev->flow_ctl_mode); - return; - } - - bt_dev_dbg(hdev, "num_blocks %d num_hndl %d", ev->num_blocks, - ev->num_hndl); - - for (i = 0; i < ev->num_hndl; i++) { - struct hci_comp_blocks_info *info = &ev->handles[i]; - struct hci_conn *conn = NULL; - __u16 handle, block_count; - - handle = __le16_to_cpu(info->handle); - block_count = __le16_to_cpu(info->blocks); - - conn = __hci_conn_lookup_handle(hdev, handle); - if (!conn) - continue; - - conn->sent -= block_count; - - switch (conn->type) { - case ACL_LINK: - case AMP_LINK: - hdev->block_cnt += block_count; - if (hdev->block_cnt > hdev->num_blocks) - hdev->block_cnt = hdev->num_blocks; - break; - - default: - bt_dev_err(hdev, "unknown type %d conn %p", - conn->type, conn); - break; - } - } - - queue_work(hdev->workqueue, &hdev->tx_work); -} - static void hci_mode_change_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -5688,150 +5554,6 @@ unlock: hci_dev_unlock(hdev); } -#if IS_ENABLED(CONFIG_BT_HS) -static void hci_chan_selected_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_channel_selected *ev = data; - struct hci_conn *hcon; - - bt_dev_dbg(hdev, "handle 0x%2.2x", ev->phy_handle); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - amp_read_loc_assoc_final_data(hdev, hcon); -} - -static void hci_phy_link_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_phy_link_complete *ev = data; - struct hci_conn *hcon, *bredr_hcon; - - bt_dev_dbg(hdev, "handle 0x%2.2x status 0x%2.2x", ev->phy_handle, - ev->status); - - hci_dev_lock(hdev); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - goto unlock; - - if (!hcon->amp_mgr) - goto unlock; - - if (ev->status) { - hci_conn_del(hcon); - goto unlock; - } - - bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon; - - hcon->state = BT_CONNECTED; - bacpy(&hcon->dst, &bredr_hcon->dst); - - hci_conn_hold(hcon); - hcon->disc_timeout = HCI_DISCONN_TIMEOUT; - hci_conn_drop(hcon); - - hci_debugfs_create_conn(hcon); - hci_conn_add_sysfs(hcon); - - amp_physical_cfm(bredr_hcon, hcon); - -unlock: - hci_dev_unlock(hdev); -} - -static void hci_loglink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_logical_link_complete *ev = data; - struct hci_conn *hcon; - struct hci_chan *hchan; - struct amp_mgr *mgr; - - bt_dev_dbg(hdev, "log_handle 0x%4.4x phy_handle 0x%2.2x status 0x%2.2x", - le16_to_cpu(ev->handle), ev->phy_handle, ev->status); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - /* Create AMP hchan */ - hchan = hci_chan_create(hcon); - if (!hchan) - return; - - hchan->handle = le16_to_cpu(ev->handle); - hchan->amp = true; - - BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan); - - mgr = hcon->amp_mgr; - if (mgr && mgr->bredr_chan) { - struct l2cap_chan *bredr_chan = mgr->bredr_chan; - - l2cap_chan_lock(bredr_chan); - - bredr_chan->conn->mtu = hdev->block_mtu; - l2cap_logical_cfm(bredr_chan, hchan, 0); - hci_conn_hold(hcon); - - l2cap_chan_unlock(bredr_chan); - } -} - -static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_disconn_logical_link_complete *ev = data; - struct hci_chan *hchan; - - bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", - le16_to_cpu(ev->handle), ev->status); - - if (ev->status) - return; - - hci_dev_lock(hdev); - - hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle)); - if (!hchan || !hchan->amp) - goto unlock; - - amp_destroy_logical_link(hchan, ev->reason); - -unlock: - hci_dev_unlock(hdev); -} - -static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_disconn_phy_link_complete *ev = data; - struct hci_conn *hcon; - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - - if (ev->status) - return; - - hci_dev_lock(hdev); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (hcon && hcon->type == AMP_LINK) { - hcon->state = BT_CLOSED; - hci_disconn_cfm(hcon, ev->reason); - hci_conn_del(hcon); - } - - hci_dev_unlock(hdev); -} -#endif - static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr, u8 bdaddr_type, bdaddr_t *local_rpa) { @@ -5912,8 +5634,8 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, goto unlock; conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, role); - if (!conn) { - bt_dev_err(hdev, "no memory for new connection"); + if (IS_ERR(conn)) { + bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; } @@ -6590,6 +6312,13 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); + + if (test_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, + &hdev->quirks)) { + info->primary_phy &= 0x1f; + info->secondary_phy &= 0x1f; + } + if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, info->bdaddr_type, NULL, 0, @@ -6637,14 +6366,16 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, if (!(flags & HCI_PROTO_DEFER)) goto unlock; - if (ev->status) { - /* Add connection to indicate the failed PA sync event */ - pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, - HCI_ROLE_SLAVE); + /* Add connection to indicate PA sync event */ + pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, + HCI_ROLE_SLAVE); - if (!pa_sync) - goto unlock; + if (IS_ERR(pa_sync)) + goto unlock; + + pa_sync->sync_handle = le16_to_cpu(ev->handle); + if (ev->status) { set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags); /* Notify iso layer */ @@ -6661,6 +6392,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, struct hci_ev_le_per_adv_report *ev = data; int mask = hdev->link_mode; __u8 flags = 0; + struct hci_conn *pa_sync; bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle)); @@ -6668,8 +6400,28 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) - hci_le_pa_term_sync(hdev, ev->sync_handle); + goto unlock; + + if (!(flags & HCI_PROTO_DEFER)) + goto unlock; + pa_sync = hci_conn_hash_lookup_pa_sync_handle + (hdev, + le16_to_cpu(ev->sync_handle)); + + if (!pa_sync) + goto unlock; + + if (ev->data_status == LE_PA_DATA_COMPLETE && + !test_and_set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags)) { + /* Notify iso layer */ + hci_connect_cfm(pa_sync, 0); + + /* Notify MGMT layer */ + mgmt_device_connected(hdev, pa_sync, NULL, 0); + } + +unlock: hci_dev_unlock(hdev); } @@ -6916,6 +6668,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, struct bt_iso_qos *qos; bool pending = false; u16 handle = __le16_to_cpu(ev->handle); + u32 c_sdu_interval, p_sdu_interval; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6940,12 +6693,25 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, pending = test_and_clear_bit(HCI_CONN_CREATE_CIS, &conn->flags); - /* Convert ISO Interval (1.25 ms slots) to SDU Interval (us) */ - qos->ucast.in.interval = le16_to_cpu(ev->interval) * 1250; - qos->ucast.out.interval = qos->ucast.in.interval; + /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 6, Part G + * page 3075: + * Transport_Latency_C_To_P = CIG_Sync_Delay + (FT_C_To_P) × + * ISO_Interval + SDU_Interval_C_To_P + * ... + * SDU_Interval = (CIG_Sync_Delay + (FT) x ISO_Interval) - + * Transport_Latency + */ + c_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) + + (ev->c_ft * le16_to_cpu(ev->interval) * 1250)) - + get_unaligned_le24(ev->c_latency); + p_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) + + (ev->p_ft * le16_to_cpu(ev->interval) * 1250)) - + get_unaligned_le24(ev->p_latency); switch (conn->role) { case HCI_ROLE_SLAVE: + qos->ucast.in.interval = c_sdu_interval; + qos->ucast.out.interval = p_sdu_interval; /* Convert Transport Latency (us) to Latency (msec) */ qos->ucast.in.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency), @@ -6959,6 +6725,8 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, qos->ucast.out.phy = ev->p_phy; break; case HCI_ROLE_MASTER: + qos->ucast.in.interval = p_sdu_interval; + qos->ucast.out.interval = c_sdu_interval; /* Convert Transport Latency (us) to Latency (msec) */ qos->ucast.out.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency), @@ -7042,7 +6810,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, if (!cis) { cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE, cis_handle); - if (!cis) { + if (IS_ERR(cis)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; } @@ -7149,9 +6917,13 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis = hci_conn_hash_lookup_handle(hdev, handle); if (!bis) { + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_dbg(hdev, "ignore too large handle %u", handle); + continue; + } bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY, HCI_ROLE_SLAVE, handle); - if (!bis) + if (IS_ERR(bis)) continue; } @@ -7181,6 +6953,8 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, u16 handle = le16_to_cpu(ev->bis[i]); bis = hci_conn_hash_lookup_handle(hdev, handle); + if (!bis) + continue; set_bit(HCI_CONN_BIG_SYNC_FAILED, &bis->flags); hci_connect_cfm(bis, ev->status); @@ -7202,10 +6976,8 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); - if (!(mask & HCI_LM_ACCEPT)) { - hci_le_pa_term_sync(hdev, ev->sync_handle); + if (!(mask & HCI_LM_ACCEPT)) goto unlock; - } if (!(flags & HCI_PROTO_DEFER)) goto unlock; @@ -7214,24 +6986,13 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, (hdev, le16_to_cpu(ev->sync_handle)); - if (pa_sync) - goto unlock; - - /* Add connection to indicate the PA sync event */ - pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, - HCI_ROLE_SLAVE); - if (!pa_sync) goto unlock; - pa_sync->sync_handle = le16_to_cpu(ev->sync_handle); - set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags); + pa_sync->iso_qos.bcast.encryption = ev->encryption; /* Notify iso layer */ - hci_connect_cfm(pa_sync, 0x00); - - /* Notify MGMT layer */ - mgmt_device_connected(hdev, pa_sync, NULL, 0); + hci_connect_cfm(pa_sync, 0); unlock: hci_dev_unlock(hdev); @@ -7645,28 +7406,6 @@ static const struct hci_ev { /* [0x3e = HCI_EV_LE_META] */ HCI_EV_REQ_VL(HCI_EV_LE_META, hci_le_meta_evt, sizeof(struct hci_ev_le_meta), HCI_MAX_EVENT_SIZE), -#if IS_ENABLED(CONFIG_BT_HS) - /* [0x40 = HCI_EV_PHY_LINK_COMPLETE] */ - HCI_EV(HCI_EV_PHY_LINK_COMPLETE, hci_phy_link_complete_evt, - sizeof(struct hci_ev_phy_link_complete)), - /* [0x41 = HCI_EV_CHANNEL_SELECTED] */ - HCI_EV(HCI_EV_CHANNEL_SELECTED, hci_chan_selected_evt, - sizeof(struct hci_ev_channel_selected)), - /* [0x42 = HCI_EV_DISCONN_PHY_LINK_COMPLETE] */ - HCI_EV(HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE, - hci_disconn_loglink_complete_evt, - sizeof(struct hci_ev_disconn_logical_link_complete)), - /* [0x45 = HCI_EV_LOGICAL_LINK_COMPLETE] */ - HCI_EV(HCI_EV_LOGICAL_LINK_COMPLETE, hci_loglink_complete_evt, - sizeof(struct hci_ev_logical_link_complete)), - /* [0x46 = HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE] */ - HCI_EV(HCI_EV_DISCONN_PHY_LINK_COMPLETE, - hci_disconn_phylink_complete_evt, - sizeof(struct hci_ev_disconn_phy_link_complete)), -#endif - /* [0x48 = HCI_EV_NUM_COMP_BLOCKS] */ - HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt, - sizeof(struct hci_ev_num_comp_blocks)), /* [0xff = HCI_EV_VENDOR] */ HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE), }; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 0be75cf0ef..c91f2838f5 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -29,10 +29,6 @@ #define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock) #define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock) -#define HCI_REQ_DONE 0 -#define HCI_REQ_PEND 1 -#define HCI_REQ_CANCELED 2 - struct hci_request { struct hci_dev *hdev; struct sk_buff_head cmd_q; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 3f5f093233..69c2ba1e84 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -101,7 +101,7 @@ static bool hci_sock_gen_cookie(struct sock *sk) int id = hci_pi(sk)->cookie; if (!id) { - id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL); if (id < 0) id = 0xffffffff; @@ -119,7 +119,7 @@ static void hci_sock_free_cookie(struct sock *sk) if (id) { hci_pi(sk)->cookie = 0xffffffff; - ida_simple_remove(&sock_cookie_ida, id); + ida_free(&sock_cookie_ida, id); } } @@ -485,7 +485,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return NULL; ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE); - ni->type = hdev->dev_type; + ni->type = 0x00; /* Old hdev->dev_type */ ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, @@ -1007,9 +1007,6 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return -EOPNOTSUPP; - if (hdev->dev_type != HCI_PRIMARY) - return -EOPNOTSUPP; - switch (cmd) { case HCISETRAW: if (!capable(CAP_NET_ADMIN)) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 40b71bc505..4e90bd722e 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -280,6 +280,19 @@ int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, } EXPORT_SYMBOL(__hci_cmd_sync_status); +int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + int err; + + hci_req_sync_lock(hdev); + err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout); + hci_req_sync_unlock(hdev); + + return err; +} +EXPORT_SYMBOL(hci_cmd_sync_status); + static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); @@ -358,8 +371,6 @@ static void le_scan_disable(struct work_struct *work) goto _return; } - hdev->discovery.scan_start = 0; - /* If we were running LE only scan, change discovery state. If * we were running both LE and BR/EDR inquiry simultaneously, * and BR/EDR inquiry is already finished, stop discovery, @@ -1043,11 +1054,10 @@ static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; u8 size; + struct adv_info *adv = NULL; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0) { - struct adv_info *adv; - adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; @@ -1066,7 +1076,7 @@ static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) cp->num_of_sets = !!instance; cp->enable = 0x00; - set->handle = instance; + set->handle = adv ? adv->handle : instance; size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets; @@ -1195,7 +1205,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; - cp.handle = instance; + cp.handle = adv ? adv->handle : instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; @@ -1235,31 +1245,27 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { - struct { - struct hci_cp_le_set_ext_scan_rsp_data cp; - u8 data[HCI_MAX_EXT_AD_LENGTH]; - } pdu; + DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length, + HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; - memset(&pdu, 0, sizeof(pdu)); - if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->scan_rsp_changed) return 0; } - len = eir_create_scan_rsp(hdev, instance, pdu.data); + len = eir_create_scan_rsp(hdev, instance, pdu->data); - pdu.cp.handle = instance; - pdu.cp.length = len; - pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + pdu->handle = adv ? adv->handle : instance; + pdu->length = len; + pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, - sizeof(pdu.cp) + len, &pdu.cp, + struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; @@ -1267,7 +1273,7 @@ static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) if (adv) { adv->scan_rsp_changed = false; } else { - memcpy(hdev->scan_rsp_data, pdu.data, len); + memcpy(hdev->scan_rsp_data, pdu->data, len); hdev->scan_rsp_data_len = len; } @@ -1335,7 +1341,7 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance) memset(set, 0, sizeof(*set)); - set->handle = instance; + set->handle = adv ? adv->handle : instance; /* Set duration per instance since controller is responsible for * scheduling it. @@ -1411,29 +1417,25 @@ static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance, static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance) { - struct { - struct hci_cp_le_set_per_adv_data cp; - u8 data[HCI_MAX_PER_AD_LENGTH]; - } pdu; + DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length, + HCI_MAX_PER_AD_LENGTH); u8 len; - - memset(&pdu, 0, sizeof(pdu)); + struct adv_info *adv = NULL; if (instance) { - struct adv_info *adv = hci_find_adv_instance(hdev, instance); - + adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic) return 0; } - len = eir_create_per_adv_data(hdev, instance, pdu.data); + len = eir_create_per_adv_data(hdev, instance, pdu->data); - pdu.cp.length = len; - pdu.cp.handle = instance; - pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu->length = len; + pdu->handle = adv ? adv->handle : instance; + pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA, - sizeof(pdu.cp) + len, &pdu, + struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); } @@ -1727,31 +1729,27 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) { - struct { - struct hci_cp_le_set_ext_adv_data cp; - u8 data[HCI_MAX_EXT_AD_LENGTH]; - } pdu; + DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length, + HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; - memset(&pdu, 0, sizeof(pdu)); - if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->adv_data_changed) return 0; } - len = eir_create_adv_data(hdev, instance, pdu.data); + len = eir_create_adv_data(hdev, instance, pdu->data); - pdu.cp.length = len; - pdu.cp.handle = instance; - pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + pdu->length = len; + pdu->handle = adv ? adv->handle : instance; + pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, - sizeof(pdu.cp) + len, &pdu.cp, + struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; @@ -1760,7 +1758,7 @@ static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) if (adv) { adv->adv_data_changed = false; } else { - memcpy(hdev->adv_data, pdu.data, len); + memcpy(hdev->adv_data, pdu->data, len); hdev->adv_data_len = len; } @@ -2569,6 +2567,16 @@ static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) return p; } +/* Clear LE Accept List */ +static int hci_le_clear_accept_list_sync(struct hci_dev *hdev) +{ + if (!(hdev->commands[26] & 0x80)) + return 0; + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL, + HCI_CMD_TIMEOUT); +} + /* Device must not be scanning when updating the accept list. * * Update is done using the following sequence: @@ -2617,6 +2625,31 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) goto done; } + /* Force address filtering if PA Sync is in progress */ + if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { + struct hci_cp_le_pa_create_sync *sent; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_PA_CREATE_SYNC); + if (sent) { + struct conn_params pa; + + memset(&pa, 0, sizeof(pa)); + + bacpy(&pa.addr, &sent->addr); + pa.addr_type = sent->addr_type; + + /* Clear first since there could be addresses left + * behind. + */ + hci_le_clear_accept_list_sync(hdev); + + num_entries = 1; + err = hci_le_add_accept_list_sync(hdev, &pa, + &num_entries); + goto done; + } + } + /* Go through the current accept list programmed into the * controller one by one and check if that address is connected or is * still in the list of pending connections or list of devices to @@ -2896,6 +2929,27 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) */ filter_policy = hci_update_accept_list_sync(hdev); + /* If suspended and filter_policy set to 0x00 (no acceptlist) then + * passive scanning cannot be started since that would require the host + * to be woken up to process the reports. + */ + if (hdev->suspended && !filter_policy) { + /* Check if accept list is empty then there is no need to scan + * while suspended. + */ + if (list_empty(&hdev->le_accept_list)) + return 0; + + /* If there are devices is the accept_list that means some + * devices could not be programmed which in non-suspended case + * means filter_policy needs to be set to 0x00 so the host needs + * to filter, but since this is treating suspended case we + * can ignore device needing host to filter to allow devices in + * the acceptlist to be able to wakeup the system. + */ + filter_policy = 0x01; + } + /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support @@ -2918,6 +2972,20 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; + + /* Disable duplicates filter when scanning for advertisement + * monitor for the following reasons. + * + * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm + * controllers ignore RSSI_Sampling_Period when the duplicates + * filter is enabled. + * + * For SW pattern filtering, when we're not doing interleaved + * scanning, it is necessary to disable duplicates filter, + * otherwise hosts can only receive one advertisement and it's + * impossible to know if a peer is still in range. + */ + filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; @@ -3488,10 +3556,6 @@ static int hci_unconf_init_sync(struct hci_dev *hdev) /* Read Local Supported Features. */ static int hci_read_local_features_sync(struct hci_dev *hdev) { - /* Not all AMP controllers support this command */ - if (hdev->dev_type == HCI_AMP && !(hdev->commands[14] & 0x20)) - return 0; - return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } @@ -3526,51 +3590,6 @@ static int hci_read_local_cmds_sync(struct hci_dev *hdev) return 0; } -/* Read Local AMP Info */ -static int hci_read_local_amp_info_sync(struct hci_dev *hdev) -{ - return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_AMP_INFO, - 0, NULL, HCI_CMD_TIMEOUT); -} - -/* Read Data Blk size */ -static int hci_read_data_block_size_sync(struct hci_dev *hdev) -{ - return __hci_cmd_sync_status(hdev, HCI_OP_READ_DATA_BLOCK_SIZE, - 0, NULL, HCI_CMD_TIMEOUT); -} - -/* Read Flow Control Mode */ -static int hci_read_flow_control_mode_sync(struct hci_dev *hdev) -{ - return __hci_cmd_sync_status(hdev, HCI_OP_READ_FLOW_CONTROL_MODE, - 0, NULL, HCI_CMD_TIMEOUT); -} - -/* Read Location Data */ -static int hci_read_location_data_sync(struct hci_dev *hdev) -{ - return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCATION_DATA, - 0, NULL, HCI_CMD_TIMEOUT); -} - -/* AMP Controller init stage 1 command sequence */ -static const struct hci_init_stage amp_init1[] = { - /* HCI_OP_READ_LOCAL_VERSION */ - HCI_INIT(hci_read_local_version_sync), - /* HCI_OP_READ_LOCAL_COMMANDS */ - HCI_INIT(hci_read_local_cmds_sync), - /* HCI_OP_READ_LOCAL_AMP_INFO */ - HCI_INIT(hci_read_local_amp_info_sync), - /* HCI_OP_READ_DATA_BLOCK_SIZE */ - HCI_INIT(hci_read_data_block_size_sync), - /* HCI_OP_READ_FLOW_CONTROL_MODE */ - HCI_INIT(hci_read_flow_control_mode_sync), - /* HCI_OP_READ_LOCATION_DATA */ - HCI_INIT(hci_read_location_data_sync), - {} -}; - static int hci_init1_sync(struct hci_dev *hdev) { int err; @@ -3584,28 +3603,9 @@ static int hci_init1_sync(struct hci_dev *hdev) return err; } - switch (hdev->dev_type) { - case HCI_PRIMARY: - hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_PACKET_BASED; - return hci_init_stage_sync(hdev, br_init1); - case HCI_AMP: - hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED; - return hci_init_stage_sync(hdev, amp_init1); - default: - bt_dev_err(hdev, "Unknown device type %d", hdev->dev_type); - break; - } - - return 0; + return hci_init_stage_sync(hdev, br_init1); } -/* AMP Controller init stage 2 command sequence */ -static const struct hci_init_stage amp_init2[] = { - /* HCI_OP_READ_LOCAL_FEATURES */ - HCI_INIT(hci_read_local_features_sync), - {} -}; - /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { @@ -3863,9 +3863,6 @@ static int hci_init2_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); - if (hdev->dev_type == HCI_AMP) - return hci_init_stage_sync(hdev, amp_init2); - err = hci_init_stage_sync(hdev, hci_init2); if (err) return err; @@ -4270,16 +4267,6 @@ static int hci_le_read_accept_list_size_sync(struct hci_dev *hdev) 0, NULL, HCI_CMD_TIMEOUT); } -/* Clear LE Accept List */ -static int hci_le_clear_accept_list_sync(struct hci_dev *hdev) -{ - if (!(hdev->commands[26] & 0x80)) - return 0; - - return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL, - HCI_CMD_TIMEOUT); -} - /* Read LE Resolving List Size */ static int hci_le_read_resolv_list_size_sync(struct hci_dev *hdev) { @@ -4703,13 +4690,6 @@ static int hci_init_sync(struct hci_dev *hdev) if (err < 0) return err; - /* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode - * BR/EDR/LE type controllers. AMP controllers only need the - * first two stages of init. - */ - if (hdev->dev_type != HCI_PRIMARY) - return 0; - err = hci_init3_sync(hdev); if (err < 0) return err; @@ -4938,12 +4918,8 @@ int hci_dev_open_sync(struct hci_dev *hdev) * In case of user channel usage, it is not important * if a public address or static random address is * available. - * - * This check is only valid for BR/EDR controllers - * since AMP controllers do not have an address. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; @@ -4978,8 +4954,7 @@ int hci_dev_open_sync(struct hci_dev *hdev) !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - hci_dev_test_flag(hdev, HCI_MGMT) && - hdev->dev_type == HCI_PRIMARY) { + hci_dev_test_flag(hdev, HCI_MGMT)) { ret = hci_powered_update_sync(hdev); mgmt_power_on(hdev, ret); } @@ -5124,8 +5099,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); - if (!auto_off && hdev->dev_type == HCI_PRIMARY && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); @@ -5187,9 +5161,6 @@ int hci_dev_close_sync(struct hci_dev *hdev) hdev->flags &= BIT(HCI_RAW); hci_dev_clear_volatile_flags(hdev); - /* Controller radio is available but is currently powered down */ - hdev->amp_status = AMP_STATUS_POWERED_DOWN; - memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); @@ -5226,8 +5197,7 @@ static int hci_power_on_sync(struct hci_dev *hdev) */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || - (hdev->dev_type == HCI_PRIMARY && - !bacmp(&hdev->bdaddr, BDADDR_ANY) && + (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_close_sync(hdev); @@ -5329,27 +5299,11 @@ int hci_stop_discovery_sync(struct hci_dev *hdev) return 0; } -static int hci_disconnect_phy_link_sync(struct hci_dev *hdev, u16 handle, - u8 reason) -{ - struct hci_cp_disconn_phy_link cp; - - memset(&cp, 0, sizeof(cp)); - cp.phy_handle = HCI_PHY_HANDLE(handle); - cp.reason = reason; - - return __hci_cmd_sync_status(hdev, HCI_OP_DISCONN_PHY_LINK, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); -} - static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_disconnect cp; - if (conn->type == AMP_LINK) - return hci_disconnect_phy_link_sync(hdev, conn->handle, reason); - if (test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) { /* This is a BIS connection, hci_conn_del will * do the necessary cleanup. @@ -5586,27 +5540,33 @@ static int hci_power_off_sync(struct hci_dev *hdev) if (!test_bit(HCI_UP, &hdev->flags)) return 0; + hci_dev_set_flag(hdev, HCI_POWERING_DOWN); + if (test_bit(HCI_ISCAN, &hdev->flags) || test_bit(HCI_PSCAN, &hdev->flags)) { err = hci_write_scan_enable_sync(hdev, 0x00); if (err) - return err; + goto out; } err = hci_clear_adv_sync(hdev, NULL, false); if (err) - return err; + goto out; err = hci_stop_discovery_sync(hdev); if (err) - return err; + goto out; /* Terminated due to Power Off */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) - return err; + goto out; - return hci_dev_close_sync(hdev); + err = hci_dev_close_sync(hdev); + +out: + hci_dev_clear_flag(hdev, HCI_POWERING_DOWN); + return err; } int hci_set_powered_sync(struct hci_dev *hdev, u8 val) @@ -6462,10 +6422,8 @@ done: int hci_le_create_cis_sync(struct hci_dev *hdev) { - struct { - struct hci_cp_le_create_cis cp; - struct hci_cis cis[0x1f]; - } cmd; + DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f); + size_t aux_num_cis = 0; struct hci_conn *conn; u8 cig = BT_ISO_QOS_CIG_UNSET; @@ -6492,8 +6450,6 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) * remains pending. */ - memset(&cmd, 0, sizeof(cmd)); - hci_dev_lock(hdev); rcu_read_lock(); @@ -6530,7 +6486,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) goto done; list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis]; + struct hci_cis *cis = &cmd->cis[aux_num_cis]; if (hci_conn_check_create_cis(conn) || conn->iso_qos.ucast.cig != cig) @@ -6539,25 +6495,25 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); - cmd.cp.num_cis++; + aux_num_cis++; - if (cmd.cp.num_cis >= ARRAY_SIZE(cmd.cis)) + if (aux_num_cis >= cmd->num_cis) break; } + cmd->num_cis = aux_num_cis; done: rcu_read_unlock(); hci_dev_unlock(hdev); - if (!cmd.cp.num_cis) + if (!aux_num_cis) return 0; /* Wait for HCI_LE_CIS_Established */ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS, - sizeof(cmd.cp) + sizeof(cmd.cis[0]) * - cmd.cp.num_cis, &cmd, - HCI_EVT_LE_CIS_ESTABLISHED, + struct_size(cmd, cis, cmd->num_cis), + cmd, HCI_EVT_LE_CIS_ESTABLISHED, conn->conn_timeout, NULL); } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index fa6c2e95d5..398fb81f7a 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -3,7 +3,7 @@ * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2022 Intel Corporation - * Copyright 2023 NXP + * Copyright 2023-2024 NXP */ #include <linux/module.h> @@ -54,7 +54,6 @@ static void iso_sock_kill(struct sock *sk); enum { BT_SK_BIG_SYNC, BT_SK_PA_SYNC, - BT_SK_PA_SYNC_TERM, }; struct iso_pinfo { @@ -81,12 +80,14 @@ static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); static bool iso_match_sid(struct sock *sk, void *data); static bool iso_match_sync_handle(struct sock *sk, void *data); +static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data); static void iso_sock_disconn(struct sock *sk); typedef bool (*iso_sock_match_t)(struct sock *sk, void *data); -static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst, - iso_sock_match_t match, void *data); +static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, + enum bt_sock_state state, + iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ #define ISO_CONN_TIMEOUT (HZ * 40) @@ -196,21 +197,10 @@ static void iso_chan_del(struct sock *sk, int err) sock_set_flag(sk, SOCK_ZAPPED); } -static bool iso_match_conn_sync_handle(struct sock *sk, void *data) -{ - struct hci_conn *hcon = data; - - if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) - return false; - - return hcon->sync_handle == iso_pi(sk)->sync_handle; -} - static void iso_conn_del(struct hci_conn *hcon, int err) { struct iso_conn *conn = hcon->iso_data; struct sock *sk; - struct sock *parent; if (!conn) return; @@ -226,25 +216,6 @@ static void iso_conn_del(struct hci_conn *hcon, int err) if (sk) { lock_sock(sk); - - /* While a PA sync hcon is in the process of closing, - * mark parent socket with a flag, so that any residual - * BIGInfo adv reports that arrive before PA sync is - * terminated are not processed anymore. - */ - if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { - parent = iso_get_sock_listen(&hcon->src, - &hcon->dst, - iso_match_conn_sync_handle, - hcon); - - if (parent) { - set_bit(BT_SK_PA_SYNC_TERM, - &iso_pi(parent)->flags); - sock_put(parent); - } - } - iso_sock_clear_timer(sk); iso_chan_del(sk, err); release_sock(sk); @@ -581,22 +552,23 @@ static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, return NULL; } -/* Find socket listening: +/* Find socket in given state: * source bdaddr (Unicast) * destination bdaddr (Broadcast only) * match func - pass NULL to ignore * match func data - pass -1 to ignore * Returns closest match. */ -static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst, - iso_sock_match_t match, void *data) +static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, + enum bt_sock_state state, + iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { - if (sk->sk_state != BT_LISTEN) + if (sk->sk_state != state) continue; /* Match Broadcast destination */ @@ -690,11 +662,8 @@ static void iso_sock_cleanup_listen(struct sock *parent) iso_sock_kill(sk); } - /* If listening socket stands for a PA sync connection, - * properly disconnect the hcon and socket. - */ - if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon && - test_bit(HCI_CONN_PA_SYNC, &iso_pi(parent)->conn->hcon->flags)) { + /* If listening socket has a hcon, properly disconnect it */ + if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) { iso_sock_disconn(parent); return; } @@ -860,6 +829,7 @@ static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; iso_pi(sk)->qos = default_qos; + iso_pi(sk)->sync_handle = -1; bt_sock_link(&iso_sk_list, sk); return sk; @@ -907,7 +877,6 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, return -EINVAL; iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; - iso_pi(sk)->sync_handle = -1; if (sa->iso_bc->bc_sid > 0x0f) return -EINVAL; @@ -984,7 +953,8 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, /* Allow the user to bind a PA sync socket to a number * of BISes to sync to. */ - if (sk->sk_state == BT_CONNECT2 && + if ((sk->sk_state == BT_CONNECT2 || + sk->sk_state == BT_CONNECTED) && test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { err = iso_sock_bind_pa_sk(sk, sa, addr_len); goto done; @@ -1076,6 +1046,8 @@ static int iso_listen_bis(struct sock *sk) { struct hci_dev *hdev; int err = 0; + struct iso_conn *conn; + struct hci_conn *hcon; BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid); @@ -1096,18 +1068,40 @@ static int iso_listen_bis(struct sock *sk) if (!hdev) return -EHOSTUNREACH; + hci_dev_lock(hdev); + /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; - return -EINVAL; + err = -EINVAL; + goto unlock; } - err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, - le_addr_type(iso_pi(sk)->dst_type), - iso_pi(sk)->bc_sid, &iso_pi(sk)->qos); + hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), + iso_pi(sk)->bc_sid, &iso_pi(sk)->qos); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } + + conn = iso_conn_add(hcon); + if (!conn) { + hci_conn_drop(hcon); + err = -ENOMEM; + goto unlock; + } + + err = iso_chan_add(conn, sk, NULL); + if (err) { + hci_conn_drop(hcon); + goto unlock; + } hci_dev_put(hdev); +unlock: + hci_dev_unlock(hdev); return err; } @@ -1165,7 +1159,7 @@ done: } static int iso_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; @@ -1174,7 +1168,7 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock, lock_sock(sk); - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); @@ -1264,7 +1258,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, return -ENOTCONN; } - mtu = iso_pi(sk)->conn->hcon->hdev->iso_mtu; + mtu = iso_pi(sk)->conn->hcon->mtu; release_sock(sk); @@ -1362,8 +1356,7 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: - if (pi->conn->hcon && - test_bit(HCI_CONN_PA_SYNC, &pi->conn->hcon->flags)) { + if (test_bit(BT_SK_PA_SYNC, &pi->flags)) { iso_conn_big_sync(sk); sk->sk_state = BT_LISTEN; } else { @@ -1372,6 +1365,16 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, } release_sock(sk); return 0; + case BT_CONNECTED: + if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { + iso_conn_big_sync(sk); + sk->sk_state = BT_LISTEN; + release_sock(sk); + return 0; + } + + release_sock(sk); + break; case BT_CONNECT: release_sock(sk); return iso_connect_cis(sk); @@ -1517,7 +1520,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && - sk->sk_state != BT_CONNECT2) { + sk->sk_state != BT_CONNECT2 && + (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) || + sk->sk_state != BT_CONNECTED)) { err = -EINVAL; break; } @@ -1738,7 +1743,7 @@ static void iso_conn_ready(struct iso_conn *conn) struct sock *sk = conn->sk; struct hci_ev_le_big_sync_estabilished *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; - struct hci_evt_le_big_info_adv_report *ev3 = NULL; + struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; BT_DBG("conn %p", conn); @@ -1756,32 +1761,37 @@ static void iso_conn_ready(struct iso_conn *conn) HCI_EVT_LE_BIG_SYNC_ESTABILISHED); /* Get reference to PA sync parent socket, if it exists */ - parent = iso_get_sock_listen(&hcon->src, - &hcon->dst, - iso_match_pa_sync_flag, NULL); + parent = iso_get_sock(&hcon->src, &hcon->dst, + BT_LISTEN, + iso_match_pa_sync_flag, + NULL); if (!parent && ev) - parent = iso_get_sock_listen(&hcon->src, - &hcon->dst, - iso_match_big, ev); + parent = iso_get_sock(&hcon->src, + &hcon->dst, + BT_LISTEN, + iso_match_big, ev); } else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) - parent = iso_get_sock_listen(&hcon->src, - &hcon->dst, - iso_match_sid, ev2); + parent = iso_get_sock(&hcon->src, + &hcon->dst, + BT_LISTEN, + iso_match_sid, ev2); } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { ev3 = hci_recv_event_data(hcon->hdev, - HCI_EVT_LE_BIG_INFO_ADV_REPORT); + HCI_EV_LE_PER_ADV_REPORT); if (ev3) - parent = iso_get_sock_listen(&hcon->src, - &hcon->dst, - iso_match_sync_handle, ev3); + parent = iso_get_sock(&hcon->src, + &hcon->dst, + BT_LISTEN, + iso_match_sync_handle_pa_report, + ev3); } if (!parent) - parent = iso_get_sock_listen(&hcon->src, - BDADDR_ANY, NULL, NULL); + parent = iso_get_sock(&hcon->src, BDADDR_ANY, + BT_LISTEN, NULL, NULL); if (!parent) return; @@ -1818,7 +1828,6 @@ static void iso_conn_ready(struct iso_conn *conn) if (ev3) { iso_pi(sk)->qos = iso_pi(parent)->qos; - iso_pi(sk)->qos.bcast.encryption = ev3->encryption; hcon->iso_qos = iso_pi(sk)->qos; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); @@ -1883,7 +1892,6 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; - int lm = 0; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); @@ -1903,8 +1911,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { - sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sid, - ev1); + sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, + iso_match_sid, ev1); if (sk && !ev1->status) iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); @@ -1913,26 +1921,29 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { - /* Try to get PA sync listening socket, if it exists */ - sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, - iso_match_pa_sync_flag, NULL); - - if (!sk) { - sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, - iso_match_sync_handle, ev2); - - /* If PA Sync is in process of terminating, - * do not handle any more BIGInfo adv reports. - */ - - if (sk && test_bit(BT_SK_PA_SYNC_TERM, - &iso_pi(sk)->flags)) - return lm; + /* Check if BIGInfo report has already been handled */ + sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECTED, + iso_match_sync_handle, ev2); + if (sk) { + sock_put(sk); + sk = NULL; + goto done; } + /* Try to get PA sync socket, if it exists */ + sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECT2, + iso_match_sync_handle, ev2); + if (!sk) + sk = iso_get_sock(&hdev->bdaddr, bdaddr, + BT_LISTEN, + iso_match_sync_handle, + ev2); + if (sk) { int err; + iso_pi(sk)->qos.bcast.encryption = ev2->encryption; + if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; @@ -1951,6 +1962,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) } } } + + goto done; } ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); @@ -1959,8 +1972,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) u8 *base; struct hci_conn *hcon; - sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, - iso_match_sync_handle_pa_report, ev3); + sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, + iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; @@ -2009,21 +2022,20 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) hcon->le_per_adv_data_len = 0; } } else { - sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL); + sk = iso_get_sock(&hdev->bdaddr, BDADDR_ANY, + BT_LISTEN, NULL, NULL); } done: if (!sk) - return lm; - - lm |= HCI_LM_ACCEPT; + return 0; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; sock_put(sk); - return lm; + return HCI_LM_ACCEPT; } static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 84fc70862d..9988ba382b 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -415,6 +415,9 @@ static void l2cap_chan_timeout(struct work_struct *work) BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); + if (!conn) + return; + mutex_lock(&conn->chan_lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling * this work. No need to call l2cap_chan_hold(chan) here again. @@ -454,6 +457,9 @@ struct l2cap_chan *l2cap_chan_create(void) /* Set default lock nesting level */ atomic_set(&chan->nesting, L2CAP_NESTING_NORMAL); + /* Available receive buffer space is initially unknown */ + chan->rx_avail = -1; + write_lock(&chan_list_lock); list_add(&chan->global_l, &chan_list); write_unlock(&chan_list_lock); @@ -535,6 +541,28 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) } EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults); +static __u16 l2cap_le_rx_credits(struct l2cap_chan *chan) +{ + size_t sdu_len = chan->sdu ? chan->sdu->len : 0; + + if (chan->mps == 0) + return 0; + + /* If we don't know the available space in the receiver buffer, give + * enough credits for a full packet. + */ + if (chan->rx_avail == -1) + return (chan->imtu / chan->mps) + 1; + + /* If we know how much space is available in the receive buffer, give + * out as many credits as would fill the buffer. + */ + if (chan->rx_avail <= sdu_len) + return 0; + + return DIV_ROUND_UP(chan->rx_avail - sdu_len, chan->mps); +} + static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits) { chan->sdu = NULL; @@ -543,8 +571,7 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits) chan->tx_credits = tx_credits; /* Derive MPS from connection MTU to stop HCI fragmentation */ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE); - /* Give enough credits for a full packet */ - chan->rx_credits = (chan->imtu / chan->mps) + 1; + chan->rx_credits = l2cap_le_rx_credits(chan); skb_queue_head_init(&chan->tx_q); } @@ -556,7 +583,7 @@ static void l2cap_ecred_init(struct l2cap_chan *chan, u16 tx_credits) /* L2CAP implementations shall support a minimum MPS of 64 octets */ if (chan->mps < L2CAP_ECRED_MIN_MPS) { chan->mps = L2CAP_ECRED_MIN_MPS; - chan->rx_credits = (chan->imtu / chan->mps) + 1; + chan->rx_credits = l2cap_le_rx_credits(chan); } } @@ -1257,7 +1284,7 @@ static void l2cap_le_connect(struct l2cap_chan *chan) struct l2cap_ecred_conn_data { struct { - struct l2cap_ecred_conn_req req; + struct l2cap_ecred_conn_req_hdr req; __le16 scid[5]; } __packed pdu; struct l2cap_chan *chan; @@ -3737,7 +3764,7 @@ static void l2cap_ecred_list_defer(struct l2cap_chan *chan, void *data) struct l2cap_ecred_rsp_data { struct { - struct l2cap_ecred_conn_rsp rsp; + struct l2cap_ecred_conn_rsp_hdr rsp; __le16 scid[L2CAP_ECRED_MAX_CID]; } __packed pdu; int count; @@ -3746,6 +3773,8 @@ struct l2cap_ecred_rsp_data { static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data) { struct l2cap_ecred_rsp_data *rsp = data; + struct l2cap_ecred_conn_rsp *rsp_flex = + container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr); if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) return; @@ -3755,7 +3784,7 @@ static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data) /* Include all channels pending with the same ident */ if (!rsp->pdu.rsp.result) - rsp->pdu.rsp.dcid[rsp->count++] = cpu_to_le16(chan->scid); + rsp_flex->dcid[rsp->count++] = cpu_to_le16(chan->scid); else l2cap_chan_del(chan, ECONNRESET); } @@ -3902,13 +3931,12 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, return 0; } -static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, - struct l2cap_cmd_hdr *cmd, - u8 *data, u8 rsp_code, u8 amp_id) +static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, + u8 *data, u8 rsp_code) { struct l2cap_conn_req *req = (struct l2cap_conn_req *) data; struct l2cap_conn_rsp rsp; - struct l2cap_chan *chan = NULL, *pchan; + struct l2cap_chan *chan = NULL, *pchan = NULL; int result, status = L2CAP_CS_NO_INFO; u16 dcid = 0, scid = __le16_to_cpu(req->scid); @@ -3921,7 +3949,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, &conn->hcon->dst, ACL_LINK); if (!pchan) { result = L2CAP_CR_BAD_PSM; - goto sendresp; + goto response; } mutex_lock(&conn->chan_lock); @@ -3983,17 +4011,8 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, status = L2CAP_CS_AUTHOR_PEND; chan->ops->defer(chan); } else { - /* Force pending result for AMP controllers. - * The connection will succeed after the - * physical link is up. - */ - if (amp_id == AMP_ID_BREDR) { - l2cap_state_change(chan, BT_CONFIG); - result = L2CAP_CR_SUCCESS; - } else { - l2cap_state_change(chan, BT_CONNECT2); - result = L2CAP_CR_PEND; - } + l2cap_state_change(chan, BT_CONFIG); + result = L2CAP_CR_SUCCESS; status = L2CAP_CS_NO_INFO; } } else { @@ -4008,17 +4027,15 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, } response: - l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); - l2cap_chan_put(pchan); - -sendresp: rsp.scid = cpu_to_le16(scid); rsp.dcid = cpu_to_le16(dcid); rsp.result = cpu_to_le16(result); rsp.status = cpu_to_le16(status); l2cap_send_cmd(conn, cmd->ident, rsp_code, sizeof(rsp), &rsp); + if (!pchan) + return; + if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) { struct l2cap_info_req info; info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK); @@ -4041,7 +4058,9 @@ sendresp: chan->num_conf_req++; } - return chan; + l2cap_chan_unlock(pchan); + mutex_unlock(&conn->chan_lock); + l2cap_chan_put(pchan); } static int l2cap_connect_req(struct l2cap_conn *conn, @@ -4058,7 +4077,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, mgmt_device_connected(hdev, hcon, NULL, 0); hci_dev_unlock(hdev); - l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP, 0); + l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP); return 0; } @@ -4628,13 +4647,7 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, memset(&rsp, 0, sizeof(rsp)); - if (max > hcon->le_conn_max_interval) { - BT_DBG("requested connection interval exceeds current bounds."); - err = -EINVAL; - } else { - err = hci_check_conn_params(min, max, latency, to_multiplier); - } - + err = hci_check_conn_params(min, max, latency, to_multiplier); if (err) rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED); else @@ -4994,10 +5007,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, u8 *data) { struct l2cap_ecred_conn_req *req = (void *) data; - struct { - struct l2cap_ecred_conn_rsp rsp; - __le16 dcid[L2CAP_ECRED_MAX_CID]; - } __packed pdu; + DEFINE_RAW_FLEX(struct l2cap_ecred_conn_rsp, pdu, dcid, L2CAP_ECRED_MAX_CID); struct l2cap_chan *chan, *pchan; u16 mtu, mps; __le16 psm; @@ -5016,7 +5026,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); - if (num_scid > ARRAY_SIZE(pdu.dcid)) { + if (num_scid > L2CAP_ECRED_MAX_CID) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } @@ -5045,7 +5055,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps); - memset(&pdu, 0, sizeof(pdu)); + memset(pdu, 0, sizeof(*pdu)); /* Check if we have socket listening on psm */ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, @@ -5071,8 +5081,8 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, BT_DBG("scid[%d] 0x%4.4x", i, scid); - pdu.dcid[i] = 0x0000; - len += sizeof(*pdu.dcid); + pdu->dcid[i] = 0x0000; + len += sizeof(*pdu->dcid); /* Check for valid dynamic CID range */ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { @@ -5106,13 +5116,13 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, l2cap_ecred_init(chan, __le16_to_cpu(req->credits)); /* Init response */ - if (!pdu.rsp.credits) { - pdu.rsp.mtu = cpu_to_le16(chan->imtu); - pdu.rsp.mps = cpu_to_le16(chan->mps); - pdu.rsp.credits = cpu_to_le16(chan->rx_credits); + if (!pdu->credits) { + pdu->mtu = cpu_to_le16(chan->imtu); + pdu->mps = cpu_to_le16(chan->mps); + pdu->credits = cpu_to_le16(chan->rx_credits); } - pdu.dcid[i] = cpu_to_le16(chan->scid); + pdu->dcid[i] = cpu_to_le16(chan->scid); __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); @@ -5134,13 +5144,13 @@ unlock: l2cap_chan_put(pchan); response: - pdu.rsp.result = cpu_to_le16(result); + pdu->result = cpu_to_le16(result); if (defer) return 0; l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP, - sizeof(pdu.rsp) + len, &pdu); + sizeof(*pdu) + len, pdu); return 0; } @@ -6239,7 +6249,7 @@ static int l2cap_finish_move(struct l2cap_chan *chan) BT_DBG("chan %p", chan); chan->rx_state = L2CAP_RX_STATE_RECV; - chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu; + chan->conn->mtu = chan->conn->hcon->mtu; return l2cap_resegment(chan); } @@ -6306,7 +6316,7 @@ static int l2cap_rx_state_wait_f(struct l2cap_chan *chan, */ chan->next_tx_seq = control->reqseq; chan->unacked_frames = 0; - chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu; + chan->conn->mtu = chan->conn->hcon->mtu; err = l2cap_resegment(chan); @@ -6511,9 +6521,7 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; struct l2cap_le_credits pkt; - u16 return_credits; - - return_credits = (chan->imtu / chan->mps) + 1; + u16 return_credits = l2cap_le_rx_credits(chan); if (chan->rx_credits >= return_credits) return; @@ -6532,6 +6540,19 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); } +void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail) +{ + if (chan->rx_avail == rx_avail) + return; + + BT_DBG("chan %p has %zd bytes avail for rx", chan, rx_avail); + + chan->rx_avail = rx_avail; + + if (chan->state == BT_CONNECTED) + l2cap_chan_le_send_credits(chan); +} + static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb) { int err; @@ -6541,6 +6562,12 @@ static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb) /* Wait recv to confirm reception before updating the credits */ err = chan->ops->recv(chan, skb); + if (err < 0 && chan->rx_avail != -1) { + BT_ERR("Queueing received LE L2CAP data failed"); + l2cap_send_disconn_req(chan, ECONNRESET); + return err; + } + /* Update credits whenever an SDU is received */ l2cap_chan_le_send_credits(chan); @@ -6563,7 +6590,8 @@ static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) } chan->rx_credits--; - BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits); + BT_DBG("chan %p: rx_credits %u -> %u", + chan, chan->rx_credits + 1, chan->rx_credits); /* Update if remote had run out of credits, this should only happens * if the remote is not using the entire MPS. @@ -6733,6 +6761,8 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, BT_DBG("chan %p, len %d", chan, skb->len); + l2cap_chan_lock(chan); + if (chan->state != BT_BOUND && chan->state != BT_CONNECTED) goto drop; @@ -6744,11 +6774,13 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, bt_cb(skb)->l2cap.psm = psm; if (!chan->ops->recv(chan, skb)) { + l2cap_chan_unlock(chan); l2cap_chan_put(chan); return; } drop: + l2cap_chan_unlock(chan); l2cap_chan_put(chan); free_skb: kfree_skb(skb); @@ -6846,18 +6878,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan); - switch (hcon->type) { - case LE_LINK: - if (hcon->hdev->le_mtu) { - conn->mtu = hcon->hdev->le_mtu; - break; - } - fallthrough; - default: - conn->mtu = hcon->hdev->acl_mtu; - break; - } - + conn->mtu = hcon->mtu; conn->feat_mask = 0; conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS; @@ -7111,14 +7132,11 @@ EXPORT_SYMBOL_GPL(l2cap_chan_connect); static void l2cap_ecred_reconfigure(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; - struct { - struct l2cap_ecred_reconf_req req; - __le16 scid; - } pdu; + DEFINE_RAW_FLEX(struct l2cap_ecred_reconf_req, pdu, scid, 1); - pdu.req.mtu = cpu_to_le16(chan->imtu); - pdu.req.mps = cpu_to_le16(chan->mps); - pdu.scid = cpu_to_le16(chan->scid); + pdu->mtu = cpu_to_le16(chan->imtu); + pdu->mps = cpu_to_le16(chan->mps); + pdu->scid[0] = cpu_to_le16(chan->scid); chan->ident = l2cap_get_ident(conn); @@ -7462,10 +7480,6 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) struct l2cap_conn *conn = hcon->l2cap_data; int len; - /* For AMP controller do not create l2cap conn */ - if (!conn && hcon->hdev->dev_type != HCI_PRIMARY) - goto drop; - if (!conn) conn = l2cap_conn_add(hcon); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 5cc83f906c..ba437c6f6e 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -327,7 +327,7 @@ done: } static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; @@ -336,7 +336,7 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, lock_sock_nested(sk, L2CAP_NESTING_PARENT); - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); @@ -1131,6 +1131,34 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, return err; } +static void l2cap_publish_rx_avail(struct l2cap_chan *chan) +{ + struct sock *sk = chan->data; + ssize_t avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc); + int expected_skbs, skb_overhead; + + if (avail <= 0) { + l2cap_chan_rx_avail(chan, 0); + return; + } + + if (!chan->mps) { + l2cap_chan_rx_avail(chan, -1); + return; + } + + /* Correct available memory by estimated sk_buff overhead. + * This is significant due to small transfer sizes. However, accept + * at least one full packet if receive space is non-zero. + */ + expected_skbs = DIV_ROUND_UP(avail, chan->mps); + skb_overhead = expected_skbs * sizeof(struct sk_buff); + if (skb_overhead < avail) + l2cap_chan_rx_avail(chan, avail - skb_overhead); + else + l2cap_chan_rx_avail(chan, -1); +} + static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -1167,28 +1195,33 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, else err = bt_sock_recvmsg(sock, msg, len, flags); - if (pi->chan->mode != L2CAP_MODE_ERTM) + if (pi->chan->mode != L2CAP_MODE_ERTM && + pi->chan->mode != L2CAP_MODE_LE_FLOWCTL && + pi->chan->mode != L2CAP_MODE_EXT_FLOWCTL) return err; - /* Attempt to put pending rx data in the socket buffer */ - lock_sock(sk); - if (!test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state)) - goto done; + l2cap_publish_rx_avail(pi->chan); - if (pi->rx_busy_skb) { - if (!__sock_queue_rcv_skb(sk, pi->rx_busy_skb)) - pi->rx_busy_skb = NULL; - else + /* Attempt to put pending rx data in the socket buffer */ + while (!list_empty(&pi->rx_busy)) { + struct l2cap_rx_busy *rx_busy = + list_first_entry(&pi->rx_busy, + struct l2cap_rx_busy, + list); + if (__sock_queue_rcv_skb(sk, rx_busy->skb) < 0) goto done; + list_del(&rx_busy->list); + kfree(rx_busy); } /* Restore data flow when half of the receive buffer is * available. This avoids resending large numbers of * frames. */ - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1) + if (test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state) && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1) l2cap_chan_busy(pi->chan, 0); done: @@ -1206,6 +1239,10 @@ static void l2cap_sock_kill(struct sock *sk) BT_DBG("sk %p state %s", sk, state_to_string(sk->sk_state)); + /* Sock is dead, so set chan data to NULL, avoid other task use invalid + * sock pointer. + */ + l2cap_pi(sk)->chan->data = NULL; /* Kill poor orphan */ l2cap_chan_put(l2cap_pi(sk)->chan); @@ -1448,18 +1485,25 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { - struct sock *sk = chan->data; + struct sock *sk; + struct l2cap_pinfo *pi; int err; - lock_sock(sk); + sk = chan->data; + if (!sk) + return -ENXIO; - if (l2cap_pi(sk)->rx_busy_skb) { + pi = l2cap_pi(sk); + lock_sock(sk); + if (chan->mode == L2CAP_MODE_ERTM && !list_empty(&pi->rx_busy)) { err = -ENOMEM; goto done; } if (chan->mode != L2CAP_MODE_ERTM && - chan->mode != L2CAP_MODE_STREAMING) { + chan->mode != L2CAP_MODE_STREAMING && + chan->mode != L2CAP_MODE_LE_FLOWCTL && + chan->mode != L2CAP_MODE_EXT_FLOWCTL) { /* Even if no filter is attached, we could potentially * get errors from security modules, etc. */ @@ -1470,7 +1514,9 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) err = __sock_queue_rcv_skb(sk, skb); - /* For ERTM, handle one skb that doesn't fit into the recv + l2cap_publish_rx_avail(chan); + + /* For ERTM and LE, handle a skb that doesn't fit into the recv * buffer. This is important to do because the data frames * have already been acked, so the skb cannot be discarded. * @@ -1479,8 +1525,18 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) * acked and reassembled until there is buffer space * available. */ - if (err < 0 && chan->mode == L2CAP_MODE_ERTM) { - l2cap_pi(sk)->rx_busy_skb = skb; + if (err < 0 && + (chan->mode == L2CAP_MODE_ERTM || + chan->mode == L2CAP_MODE_LE_FLOWCTL || + chan->mode == L2CAP_MODE_EXT_FLOWCTL)) { + struct l2cap_rx_busy *rx_busy = + kmalloc(sizeof(*rx_busy), GFP_KERNEL); + if (!rx_busy) { + err = -ENOMEM; + goto done; + } + rx_busy->skb = skb; + list_add_tail(&rx_busy->list, &pi->rx_busy); l2cap_chan_busy(chan, 1); err = 0; } @@ -1706,6 +1762,8 @@ static const struct l2cap_ops l2cap_chan_ops = { static void l2cap_sock_destruct(struct sock *sk) { + struct l2cap_rx_busy *rx_busy, *next; + BT_DBG("sk %p", sk); if (l2cap_pi(sk)->chan) { @@ -1713,9 +1771,10 @@ static void l2cap_sock_destruct(struct sock *sk) l2cap_chan_put(l2cap_pi(sk)->chan); } - if (l2cap_pi(sk)->rx_busy_skb) { - kfree_skb(l2cap_pi(sk)->rx_busy_skb); - l2cap_pi(sk)->rx_busy_skb = NULL; + list_for_each_entry_safe(rx_busy, next, &l2cap_pi(sk)->rx_busy, list) { + kfree_skb(rx_busy->skb); + list_del(&rx_busy->list); + kfree(rx_busy); } skb_queue_purge(&sk->sk_receive_queue); @@ -1799,6 +1858,8 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent) chan->data = sk; chan->ops = &l2cap_chan_ops; + + l2cap_publish_rx_avail(chan); } static struct proto l2cap_proto = { @@ -1820,6 +1881,8 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, sk->sk_destruct = l2cap_sock_destruct; sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; + INIT_LIST_HEAD(&l2cap_pi(sk)->rx_busy); + chan = l2cap_chan_create(); if (!chan) { sk_free(sk); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b8e05ddeed..80f220b7e1 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -443,8 +443,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_PRIMARY && - !hci_dev_test_flag(d, HCI_UNCONFIGURED)) + if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -468,8 +467,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_PRIMARY && - !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { + if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); bt_dev_dbg(hdev, "Added hci%u", d->id); } @@ -503,8 +501,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, count = 0; list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_PRIMARY && - hci_dev_test_flag(d, HCI_UNCONFIGURED)) + if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } @@ -528,8 +525,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_PRIMARY && - hci_dev_test_flag(d, HCI_UNCONFIGURED)) { + if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); bt_dev_dbg(hdev, "Added hci%u", d->id); } @@ -561,10 +557,8 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, read_lock(&hci_dev_list_lock); count = 0; - list_for_each_entry(d, &hci_dev_list, list) { - if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP) - count++; - } + list_for_each_entry(d, &hci_dev_list, list) + count++; rp = kmalloc(struct_size(rp, entry, count), GFP_ATOMIC); if (!rp) { @@ -585,16 +579,10 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; - if (d->dev_type == HCI_PRIMARY) { - if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) - rp->entry[count].type = 0x01; - else - rp->entry[count].type = 0x00; - } else if (d->dev_type == HCI_AMP) { - rp->entry[count].type = 0x02; - } else { - continue; - } + if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) + rp->entry[count].type = 0x01; + else + rp->entry[count].type = 0x00; rp->entry[count].bus = d->bus; rp->entry[count++].index = cpu_to_le16(d->id); @@ -1385,6 +1373,14 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + if (!cp->val) { + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, + MGMT_STATUS_BUSY); + goto failed; + } + } + if (pending_find(MGMT_OP_SET_POWERED, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, MGMT_STATUS_BUSY); @@ -1699,8 +1695,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - if (cmd) - mgmt_pending_remove(cmd); + mgmt_pending_remove(cmd); hci_dev_unlock(hdev); } @@ -8770,8 +8765,7 @@ static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data, } unlock: - if (cmd) - mgmt_pending_free(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } @@ -9325,23 +9319,14 @@ void mgmt_index_added(struct hci_dev *hdev) if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return; - switch (hdev->dev_type) { - case HCI_PRIMARY: - if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { - mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, - NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); - ev.type = 0x01; - } else { - mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0, - HCI_MGMT_INDEX_EVENTS); - ev.type = 0x00; - } - break; - case HCI_AMP: - ev.type = 0x02; - break; - default: - return; + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { + mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0, + HCI_MGMT_UNCONF_INDEX_EVENTS); + ev.type = 0x01; + } else { + mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0, + HCI_MGMT_INDEX_EVENTS); + ev.type = 0x00; } ev.bus = hdev->bus; @@ -9358,25 +9343,16 @@ void mgmt_index_removed(struct hci_dev *hdev) if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return; - switch (hdev->dev_type) { - case HCI_PRIMARY: - mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); + mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status); - if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { - mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, - NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS); - ev.type = 0x01; - } else { - mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, - HCI_MGMT_INDEX_EVENTS); - ev.type = 0x00; - } - break; - case HCI_AMP: - ev.type = 0x02; - break; - default: - return; + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { + mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0, + HCI_MGMT_UNCONF_INDEX_EVENTS); + ev.type = 0x01; + } else { + mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0, + HCI_MGMT_INDEX_EVENTS); + ev.type = 0x00; } ev.bus = hdev->bus; @@ -9692,6 +9668,9 @@ bool mgmt_powering_down(struct hci_dev *hdev) struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) + return true; + cmd = pending_find(MGMT_OP_SET_POWERED, hdev); if (!cmd) return false; @@ -9999,6 +9978,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) /* If this is a HCI command related to powering on the * HCI dev don't send any mgmt signals. */ + if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) + return; + if (pending_find(MGMT_OP_SET_POWERED, hdev)) return; } diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 9612c5d1b1..d039683d3b 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -769,7 +769,7 @@ void msft_register(struct hci_dev *hdev) mutex_init(&msft->filter_lock); } -void msft_unregister(struct hci_dev *hdev) +void msft_release(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 2a63205b37..fe538e9c91 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -14,7 +14,7 @@ bool msft_monitor_supported(struct hci_dev *hdev); void msft_register(struct hci_dev *hdev); -void msft_unregister(struct hci_dev *hdev); +void msft_release(struct hci_dev *hdev); void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb); @@ -35,7 +35,7 @@ static inline bool msft_monitor_supported(struct hci_dev *hdev) } static inline void msft_register(struct hci_dev *hdev) {} -static inline void msft_unregister(struct hci_dev *hdev) {} +static inline void msft_release(struct hci_dev *hdev) {} static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, void *data, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 29aa07e9db..37d63d768a 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -468,8 +468,8 @@ done: return err; } -static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; @@ -483,7 +483,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f goto done; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 5d03c5440b..a5ac160c59 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -83,6 +83,10 @@ static void sco_sock_timeout(struct work_struct *work) struct sock *sk; sco_conn_lock(conn); + if (!conn->hcon) { + sco_conn_unlock(conn); + return; + } sk = conn->sk; if (sk) sock_hold(sk); @@ -122,7 +126,6 @@ static void sco_sock_clear_timer(struct sock *sk) /* ---- SCO connections ---- */ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) { - struct hci_dev *hdev = hcon->hdev; struct sco_conn *conn = hcon->sco_data; if (conn) { @@ -140,9 +143,10 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) hcon->sco_data = conn; conn->hcon = hcon; + conn->mtu = hcon->mtu; - if (hdev->sco_mtu > 0) - conn->mtu = hdev->sco_mtu; + if (hcon->mtu > 0) + conn->mtu = hcon->mtu; else conn->mtu = 60; @@ -643,7 +647,7 @@ done: } static int sco_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; @@ -652,7 +656,7 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock, lock_sock(sk); - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 8906f7bdf4..891cdf61c6 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -7,7 +7,7 @@ #include <linux/bpf.h> #include <linux/btf.h> -extern struct bpf_struct_ops bpf_bpf_dummy_ops; +static struct bpf_struct_ops bpf_bpf_dummy_ops; /* A common type for test_N with return value in bpf_dummy_ops */ typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...); @@ -22,6 +22,8 @@ struct bpf_dummy_ops_test_args { struct bpf_dummy_ops_state state; }; +static struct btf *bpf_dummy_ops_btf; + static struct bpf_dummy_ops_test_args * dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr) { @@ -77,6 +79,51 @@ static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +static const struct bpf_ctx_arg_aux *find_ctx_arg_info(struct bpf_prog_aux *aux, int offset) +{ + int i; + + for (i = 0; i < aux->ctx_arg_info_size; i++) + if (aux->ctx_arg_info[i].offset == offset) + return &aux->ctx_arg_info[i]; + + return NULL; +} + +/* There is only one check at the moment: + * - zero should not be passed for pointer parameters not marked as nullable. + */ +static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_args *args) +{ + const struct btf_type *func_proto = prog->aux->attach_func_proto; + + for (u32 arg_no = 0; arg_no < btf_type_vlen(func_proto) ; ++arg_no) { + const struct btf_param *param = &btf_params(func_proto)[arg_no]; + const struct bpf_ctx_arg_aux *info; + const struct btf_type *t; + int offset; + + if (args->args[arg_no] != 0) + continue; + + /* Program is validated already, so there is no need + * to check if t is NULL. + */ + t = btf_type_skip_modifiers(bpf_dummy_ops_btf, param->type, NULL); + if (!btf_type_is_ptr(t)) + continue; + + offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no); + info = find_ctx_arg_info(prog->aux, offset); + if (info && (info->reg_type & PTR_MAYBE_NULL)) + continue; + + return -EINVAL; + } + + return 0; +} + extern const struct bpf_link_ops bpf_struct_ops_link_lops; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -85,14 +132,21 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_links *tlinks = NULL; struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; + u32 image_off = 0; int prog_ret; + s32 type_id; int err; - if (prog->aux->attach_btf_id != st_ops->type_id) + type_id = btf_find_by_name_kind(bpf_dummy_ops_btf, + bpf_bpf_dummy_ops.name, + BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + if (prog->aux->attach_btf_id != type_id) return -EOPNOTSUPP; func_proto = prog->aux->attach_func_proto; @@ -100,14 +154,12 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); - tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); - if (!tlinks) { - err = -ENOMEM; + err = check_test_run_args(prog, args); + if (err) goto out; - } - image = arch_alloc_bpf_trampoline(PAGE_SIZE); - if (!image) { + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) { err = -ENOMEM; goto out; } @@ -125,11 +177,14 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[op_idx], &dummy_ops_test_ret_function, - image, image + PAGE_SIZE); + &image, &image_off, + true); if (err < 0) goto out; - arch_protect_bpf_trampoline(image, PAGE_SIZE); + err = arch_protect_bpf_trampoline(image, PAGE_SIZE); + if (err) + goto out; prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); @@ -139,7 +194,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, err = -EFAULT; out: kfree(args); - arch_free_bpf_trampoline(image, PAGE_SIZE); + bpf_struct_ops_image_free(image); if (link) bpf_link_put(&link->link); kfree(tlinks); @@ -148,6 +203,7 @@ out: static int bpf_dummy_init(struct btf *btf) { + bpf_dummy_ops_btf = btf; return 0; } @@ -169,7 +225,7 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t, case offsetof(struct bpf_dummy_ops, test_sleepable): break; default: - if (prog->aux->sleepable) + if (prog->sleepable) return -EINVAL; } @@ -225,7 +281,7 @@ static void bpf_dummy_unreg(void *kdata) { } -static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb) +static int bpf_dummy_ops__test_1(struct bpf_dummy_ops_state *cb__nullable) { return 0; } @@ -242,12 +298,12 @@ static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb) } static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { - .test_1 = bpf_dummy_test_1, + .test_1 = bpf_dummy_ops__test_1, .test_2 = bpf_dummy_test_2, .test_sleepable = bpf_dummy_test_sleepable, }; -struct bpf_struct_ops bpf_bpf_dummy_ops = { +static struct bpf_struct_ops bpf_bpf_dummy_ops = { .verifier_ops = &bpf_dummy_verifier_ops, .init = bpf_dummy_init, .check_member = bpf_dummy_ops_check_member, @@ -256,4 +312,11 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = { .unreg = bpf_dummy_unreg, .name = "bpf_dummy_ops", .cfi_stubs = &__bpf_bpf_dummy_ops, + .owner = THIS_MODULE, }; + +static int __init bpf_dummy_struct_ops_init(void) +{ + return register_bpf_struct_ops(&bpf_bpf_dummy_ops, bpf_dummy_ops); +} +late_initcall(bpf_dummy_struct_ops_init); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index dfd9193740..36ae54f57b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -12,6 +12,7 @@ #include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> +#include <net/hotdata.h> #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> @@ -254,7 +255,8 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes, int i, n; LIST_HEAD(list); - n = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, (void **)skbs); + n = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes, + (void **)skbs); if (unlikely(n == 0)) { for (i = 0; i < nframes; i++) xdp_return_frame(frames[i]); @@ -573,6 +575,13 @@ __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, return a + *b + c + d + (long)e + f + g; } +__bpf_kfunc int bpf_modify_return_test_tp(int nonce) +{ + trace_bpf_trigger_tp(nonce); + + return nonce; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -617,21 +626,22 @@ CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_test_modify_return_ids) +BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) -BTF_SET8_END(bpf_test_modify_return_ids) +BTF_KFUNCS_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; -BTF_SET8_START(test_sk_check_kfunc_ids) +BTF_KFUNCS_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) -BTF_SET8_END(test_sk_check_kfunc_ids) +BTF_KFUNCS_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) @@ -717,10 +727,16 @@ static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; + struct bpf_trace_run_ctx run_ctx = {}; + struct bpf_run_ctx *old_run_ctx; + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); rcu_read_lock(); info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); } int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, diff --git a/net/bridge/br.c b/net/bridge/br.c index ac19b797db..2cab878e0a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -356,26 +356,21 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit_batch(struct list_head *net_list) +static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net_device *dev; struct net *net; - LIST_HEAD(list); - - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) for_each_netdev(net, dev) if (netif_is_bridge_master(dev)) - br_dev_delete(dev, &list); - - unregister_netdevice_many(&list); - - rtnl_unlock(); + br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { - .exit_batch = br_net_exit_batch, + .exit_batch_rtnl = br_net_exit_batch_rtnl, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 65cee0ad3c..fb1115857e 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -27,6 +27,7 @@ EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + enum skb_drop_reason reason = pskb_may_pull_reason(skb, ETH_HLEN); struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mcast *brmctx = &br->multicast_ctx; @@ -38,6 +39,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest; u16 vid = 0; + if (unlikely(reason != SKB_NOT_DROPPED_YET)) { + kfree_skb_reason(skb, reason); + return NETDEV_TX_OK; + } + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); @@ -108,38 +114,23 @@ out: return NETDEV_TX_OK; } -static struct lock_class_key bridge_netdev_addr_lock_key; - -static void br_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); -} - static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); int err; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - err = br_fdb_hash_init(br); - if (err) { - free_percpu(dev->tstats); + if (err) return err; - } err = br_mdb_hash_init(br); if (err) { - free_percpu(dev->tstats); br_fdb_hash_fini(br); return err; } err = br_vlan_init(br); if (err) { - free_percpu(dev->tstats); br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; @@ -147,14 +138,14 @@ static int br_dev_init(struct net_device *dev) err = br_multicast_init_stats(br); if (err) { - free_percpu(dev->tstats); br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); + return err; } - br_set_lockdep_class(dev); - return err; + netdev_lockdep_set_classes(dev); + return 0; } static void br_dev_uninit(struct net_device *dev) @@ -166,7 +157,6 @@ static void br_dev_uninit(struct net_device *dev) br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); - free_percpu(dev->tstats); } static int br_dev_open(struct net_device *dev) @@ -213,7 +203,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); /* this flag will be cleared if the MTU was automatically adjusted */ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true); @@ -405,7 +395,7 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx, br_vlan_fill_forward_path_pvid(br, ctx, path); f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id); - if (!f || !f->dst) + if (!f) return -1; dst = READ_ONCE(f->dst); @@ -481,7 +471,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_fill_forward_path = br_fill_forward_path, }; -static struct device_type br_type = { +static const struct device_type br_type = { .name = "bridge", }; @@ -503,6 +493,7 @@ void br_dev_setup(struct net_device *dev) dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->vlan_features = COMMON_FEATURES; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; br->dev = dev; spin_lock_init(&br->lock); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c622de5ecc..c77591e638 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -35,10 +35,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly; int __init br_fdb_init(void) { - br_fdb_cache = kmem_cache_create("bridge_fdb_cache", - sizeof(struct net_bridge_fdb_entry), - 0, - SLAB_HWCACHE_ALIGN, NULL); + br_fdb_cache = KMEM_CACHE(net_bridge_fdb_entry, SLAB_HWCACHE_ALIGN); if (!br_fdb_cache) return -ENOMEM; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7431f89e89..e19b583ff2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,8 +25,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) && - nbp_switchdev_allowed_egress(p, skb) && + (br_mst_is_enabled(p->br) || p->state == BR_STATE_FORWARDING) && + br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } @@ -258,6 +258,7 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; const unsigned char *src = eth_hdr(skb)->h_source; + struct sk_buff *nskb; if (!should_deliver(p, skb)) return; @@ -266,12 +267,16 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, if (skb->dev == p->dev && ether_addr_equal(src, addr)) return; - skb = skb_copy(skb, GFP_ATOMIC); - if (!skb) { + __skb_push(skb, ETH_HLEN); + nskb = pskb_copy(skb, GFP_ATOMIC); + __skb_pull(skb, ETH_HLEN); + if (!nskb) { DEV_STATS_INC(dev, tx_dropped); return; } + skb = nskb; + __skb_pull(skb, ETH_HLEN); if (!is_broadcast_ether_addr(addr)) memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN); diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index ee680adcee..1820f09ff5 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -73,12 +73,11 @@ int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state) } EXPORT_SYMBOL_GPL(br_mst_get_state); -static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_vlan *v, +static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, u8 state) { - struct net_bridge_vlan_group *vg = nbp_vlan_group(p); - - if (v->state == state) + if (br_vlan_get_state(v) == state) return; br_vlan_set_state(v, state); @@ -100,11 +99,12 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, }; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; - int err; + int err = 0; - vg = nbp_vlan_group(p); + rcu_read_lock(); + vg = nbp_vlan_group_rcu(p); if (!vg) - return 0; + goto out; /* MSTI 0 (CST) state changes are notified via the regular * SWITCHDEV_ATTR_ID_PORT_STP_STATE. @@ -112,17 +112,20 @@ int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, if (msti) { err = switchdev_port_attr_set(p->dev, &attr, extack); if (err && err != -EOPNOTSUPP) - return err; + goto out; } - list_for_each_entry(v, &vg->vlan_list, vlist) { + err = 0; + list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (v->brvlan->msti != msti) continue; - br_mst_vlan_set_state(p, v, state); + br_mst_vlan_set_state(vg, v, state); } - return 0; +out: + rcu_read_unlock(); + return err; } static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) @@ -136,13 +139,13 @@ static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti) * it. */ if (v != pv && v->brvlan->msti == msti) { - br_mst_vlan_set_state(pv->port, pv, v->state); + br_mst_vlan_set_state(vg, pv, v->state); return; } } /* Otherwise, start out in a new MSTI with all ports disabled. */ - return br_mst_vlan_set_state(pv->port, pv, BR_STATE_DISABLED); + return br_mst_vlan_set_state(vg, pv, BR_STATE_DISABLED); } int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 2d7b732429..b2ae0d2434 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2045,16 +2045,14 @@ void br_multicast_del_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; - HLIST_HEAD(deleted_head); struct hlist_node *n; /* Take care of the remaining groups, only perm ones should be left */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) br_multicast_find_del_pg(br, pg); - hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); - br_multicast_gc(&deleted_head); + flush_work(&br->mcast_gc_work); br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } @@ -5053,7 +5051,7 @@ void br_multicast_uninit_stats(struct net_bridge *br) free_percpu(br->mcast_stats); } -/* noinline for https://bugs.llvm.org/show_bug.cgi?id=45802#c9 */ +/* noinline for https://llvm.org/pr45802#c9 */ static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 22e35623c1..bf30c50b56 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -399,7 +399,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ goto free_skb; rt = ip_route_output(net, iph->daddr, 0, - RT_TOS(iph->tos), 0); + RT_TOS(iph->tos), 0, + RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ @@ -1225,7 +1226,6 @@ static struct ctl_table brnf_table[] = { .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, - { } }; static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) @@ -1274,7 +1274,7 @@ static int br_netfilter_sysctl_init_net(struct net *net) static void br_netfilter_sysctl_exit_net(struct net *net, struct brnf_net *brnet) { - struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; + const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(brnet->ctl_hdr); if (!net_eq(net, &init_net)) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d415833fce..f17dbac7d8 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -455,7 +455,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, u32 filter_mask, const struct net_device *dev, bool getlink) { - u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN; struct nlattr *af = NULL; struct net_bridge *br; struct ifinfomsg *hdr; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 15f44d026e..9c2fffb827 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -841,7 +841,7 @@ void br_vlan_flush(struct net_bridge *br) vg = br_vlan_group(br); __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); - synchronize_rcu(); + synchronize_net(); __vlan_group_free(vg); } @@ -1372,7 +1372,7 @@ void nbp_vlan_flush(struct net_bridge_port *port) vg = nbp_vlan_group(port); __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); - synchronize_rcu(); + synchronize_net(); __vlan_group_free(vg); } diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 81833ca7a2..a966a6ec82 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -65,13 +65,14 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, { struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err; if (metadata) return -EEXIST; - metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, - key, 0); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, key, 0); if (!metadata) return -EINVAL; @@ -185,6 +186,7 @@ void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tunnel_dst; __be64 tunnel_id; int err; @@ -202,7 +204,8 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, return err; if (BR_INPUT_SKB_CB(skb)->backup_nhid) { - tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, tunnel_id, 0); if (!tunnel_dst) return -ENOMEM; diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 7f304a19ac..104c0125e3 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -39,6 +39,10 @@ config NF_CONNTRACK_BRIDGE To compile it as a module, choose M here. If unsure, say N. +# old sockopt interface and eval loop +config BRIDGE_NF_EBTABLES_LEGACY + tristate + menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" depends on BRIDGE && NETFILTER && NETFILTER_XTABLES @@ -55,6 +59,7 @@ if BRIDGE_NF_EBTABLES # config BRIDGE_EBT_BROUTE tristate "ebt: broute table support" + select BRIDGE_NF_EBTABLES_LEGACY help The ebtables broute table is used to define rules that decide between bridging and routing frames, giving Linux the functionality of a @@ -65,6 +70,7 @@ config BRIDGE_EBT_BROUTE config BRIDGE_EBT_T_FILTER tristate "ebt: filter table support" + select BRIDGE_NF_EBTABLES_LEGACY help The ebtables filter table is used to define frame filtering rules at local input, forwarding and local output. See the man page for @@ -74,6 +80,7 @@ config BRIDGE_EBT_T_FILTER config BRIDGE_EBT_T_NAT tristate "ebt: nat table support" + select BRIDGE_NF_EBTABLES_LEGACY help The ebtables nat table is used to define rules that alter the MAC source address (MAC SNAT) or the MAC destination address (MAC DNAT). diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 1c9ce49ab6..b9a1303da9 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # connection tracking obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o -obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o +obj-$(CONFIG_BRIDGE_NF_EBTABLES_LEGACY) += ebtables.o # tables obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 8480684f27..20139fa1be 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -201,14 +201,14 @@ int cfctrl_linkup_request(struct cflayer *layer, struct cflayer *user_layer) { struct cfctrl *cfctrl = container_obj(layer); + struct cflayer *dn = cfctrl->serv.layer.dn; + char utility_name[UTILITY_NAME_LENGTH]; + struct cfctrl_request_info *req; + struct cfpkt *pkt; u32 tmp32; u16 tmp16; u8 tmp8; - struct cfctrl_request_info *req; int ret; - char utility_name[16]; - struct cfpkt *pkt; - struct cflayer *dn = cfctrl->serv.layer.dn; if (!dn) { pr_debug("not able to send linkup request\n"); diff --git a/net/can/af_can.c b/net/can/af_can.c index 7343fd487d..707576eeeb 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -865,6 +865,8 @@ static __init int can_init(void) /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || + offsetof(struct can_frame, len) != + offsetof(struct canxl_frame, flags) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); diff --git a/net/can/bcm.c b/net/can/bcm.c index 9168114fc8..27d5fcf0ea 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -72,9 +72,11 @@ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ +#define RX_LOCAL 0x10 /* frame was created on the local host */ +#define RX_OWN 0x20 /* frame was sent via the socket it was received on */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ -#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */ +#define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ @@ -138,6 +140,16 @@ static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; +/* Return pointer to store the extra msg flags for bcm_recvmsg(). + * We use the space of one unsigned int beyond the 'struct sockaddr_can' + * in skb->cb. + */ +static inline unsigned int *bcm_flags(struct sk_buff *skb) +{ + /* return pointer after struct sockaddr_can */ + return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); +} + static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; @@ -325,6 +337,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; + unsigned int *pflags; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) @@ -332,6 +345,14 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, skb_put_data(skb, head, sizeof(*head)); + /* ensure space for sockaddr_can and msg flags */ + sock_skb_cb_check_size(sizeof(struct sockaddr_can) + + sizeof(unsigned int)); + + /* initialize msg flags */ + pflags = bcm_flags(skb); + *pflags = 0; + if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); @@ -344,8 +365,14 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, * relevant for updates that are generated by the * BCM, where nframes is 1 */ - if (head->nframes == 1) + if (head->nframes == 1) { + if (firstframe->flags & RX_LOCAL) + *pflags |= MSG_DONTROUTE; + if (firstframe->flags & RX_OWN) + *pflags |= MSG_CONFIRM; + firstframe->flags &= BCM_CAN_FLAGS_MASK; + } } if (has_timestamp) { @@ -360,7 +387,6 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, * containing the interface index. */ - sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; @@ -444,7 +470,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ - data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); + data->flags &= ~RX_THR; memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; @@ -465,13 +491,17 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, - const struct canfd_frame *rxdata) + const struct canfd_frame *rxdata, + unsigned char traffic_flags) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); + /* add own/local/remote traffic flags */ + lastdata->flags |= traffic_flags; + /* throttling mode inactive ? */ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ @@ -508,7 +538,8 @@ rx_changed_settime: * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, - const struct canfd_frame *rxdata) + const struct canfd_frame *rxdata, + unsigned char traffic_flags) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; @@ -521,7 +552,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ - bcm_rx_update_and_send(op, lcf, rxdata); + bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } @@ -529,7 +560,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { - bcm_rx_update_and_send(op, lcf, rxdata); + bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } @@ -537,7 +568,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { - bcm_rx_update_and_send(op, lcf, rxdata); + bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } @@ -644,6 +675,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; + unsigned char traffic_flags; if (op->can_id != rxframe->can_id) return; @@ -673,15 +705,24 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) return; } + /* compute flags to distinguish between own/local/remote CAN traffic */ + traffic_flags = 0; + if (skb->sk) { + traffic_flags |= RX_LOCAL; + if (skb->sk == op->sk) + traffic_flags |= RX_OWN; + } + if (op->flags & RX_FILTER_ID) { /* the easiest case */ - bcm_rx_update_and_send(op, op->last_frames, rxframe); + bcm_rx_update_and_send(op, op->last_frames, rxframe, + traffic_flags); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ - bcm_rx_cmp_to_index(op, 0, rxframe); + bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags); goto rx_starttimer; } @@ -698,7 +739,8 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { - bcm_rx_cmp_to_index(op, i, rxframe); + bcm_rx_cmp_to_index(op, i, rxframe, + traffic_flags); break; } } @@ -1675,6 +1717,9 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } + /* assign the flags that have been recorded in bcm_send_to_user() */ + msg->msg_flags |= *(bcm_flags(skb)); + skb_free_datagram(sk, skb); return size; diff --git a/net/can/isotp.c b/net/can/isotp.c index d1c6f206f4..25bac0fafc 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -381,8 +381,9 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) return 1; } - /* get communication parameters only from the first FC frame */ - if (so->tx.state == ISOTP_WAIT_FIRST_FC) { + /* get static/dynamic communication params from first/every FC frame */ + if (so->tx.state == ISOTP_WAIT_FIRST_FC || + so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index a6fb89fa62..7e8a20f2fc 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -30,10 +30,6 @@ MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); /* CAN_HDR: #bytes before can_frame data part */ #define J1939_CAN_HDR (offsetof(struct can_frame, data)) -/* CAN_FTR: #bytes beyond data part */ -#define J1939_CAN_FTR (sizeof(struct can_frame) - J1939_CAN_HDR - \ - sizeof(((struct can_frame *)0)->data)) - /* lowest layer */ static void j1939_can_recv(struct sk_buff *iskb, void *data) { @@ -342,7 +338,7 @@ int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) memset(cf, 0, J1939_CAN_HDR); /* make it a full can frame again */ - skb_put(skb, J1939_CAN_FTR + (8 - dlc)); + skb_put_zero(skb, 8 - dlc); canid = CAN_EFF_FLAG | (skcb->priority << 26) | diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index fe3df23a25..4be73de503 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1593,8 +1593,8 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb); struct j1939_session *session; const u8 *dat; + int len, ret; pgn_t pgn; - int len; netdev_dbg(priv->ndev, "%s\n", __func__); @@ -1653,7 +1653,22 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, session->tskey = priv->rx_tskey++; j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS); - WARN_ON_ONCE(j1939_session_activate(session)); + ret = j1939_session_activate(session); + if (ret) { + /* Entering this scope indicates an issue with the J1939 bus. + * Possible scenarios include: + * - A time lapse occurred, and a new session was initiated + * due to another packet being sent correctly. This could + * have been caused by too long interrupt, debugger, or being + * out-scheduled by another task. + * - The bus is receiving numerous erroneous packets, either + * from a malfunctioning device or during a test scenario. + */ + netdev_alert(priv->ndev, "%s: 0x%p: concurrent session with same addr (%02x %02x) is already active.\n", + __func__, session, skcb.addr.sa, skcb.addr.da); + j1939_session_put(session); + return NULL; + } return session; } @@ -1681,6 +1696,8 @@ static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, j1939_session_timers_cancel(session); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); + if (session->transmission) + j1939_session_deactivate_activate_next(session); return -EBUSY; } diff --git a/net/can/raw.c b/net/can/raw.c index e6b822624b..00533f64d6 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -91,6 +91,10 @@ struct raw_sock { int recv_own_msgs; int fd_frames; int xl_frames; + struct can_raw_vcid_options raw_vcid_opts; + canid_t tx_vcid_shifted; + canid_t rx_vcid_shifted; + canid_t rx_vcid_mask_shifted; int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ @@ -134,10 +138,29 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; /* make sure to not pass oversized frames to the socket */ - if ((!ro->fd_frames && can_is_canfd_skb(oskb)) || - (!ro->xl_frames && can_is_canxl_skb(oskb))) + if (!ro->fd_frames && can_is_canfd_skb(oskb)) return; + if (can_is_canxl_skb(oskb)) { + struct canxl_frame *cxl = (struct canxl_frame *)oskb->data; + + /* make sure to not pass oversized frames to the socket */ + if (!ro->xl_frames) + return; + + /* filter CAN XL VCID content */ + if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) { + /* apply VCID filter if user enabled the filter */ + if ((cxl->prio & ro->rx_vcid_mask_shifted) != + (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted)) + return; + } else { + /* no filter => do not forward VCID tagged frames */ + if (cxl->prio & CANXL_VCID_MASK) + return; + } + } + /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { @@ -698,6 +721,19 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->fd_frames = ro->xl_frames; break; + case CAN_RAW_XL_VCID_OPTS: + if (optlen != sizeof(ro->raw_vcid_opts)) + return -EINVAL; + + if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen)) + return -EFAULT; + + /* prepare 32 bit values for handling in hot path */ + ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET; + ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET; + ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET; + break; + case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(ro->join_filters)) return -EINVAL; @@ -720,7 +756,6 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, struct raw_sock *ro = raw_sk(sk); int len; void *val; - int err = 0; if (level != SOL_CAN_RAW) return -EINVAL; @@ -730,7 +765,9 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; switch (optname) { - case CAN_RAW_FILTER: + case CAN_RAW_FILTER: { + int err = 0; + lock_sock(sk); if (ro->count > 0) { int fsize = ro->count * sizeof(struct can_filter); @@ -755,7 +792,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, if (!err) err = put_user(len, optlen); return err; - + } case CAN_RAW_ERR_FILTER: if (len > sizeof(can_err_mask_t)) len = sizeof(can_err_mask_t); @@ -786,6 +823,25 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->xl_frames; break; + case CAN_RAW_XL_VCID_OPTS: { + int err = 0; + + /* user space buffer to small for VCID opts? */ + if (len < sizeof(ro->raw_vcid_opts)) { + /* return -ERANGE and needed space in optlen */ + err = -ERANGE; + if (put_user(sizeof(ro->raw_vcid_opts), optlen)) + err = -EFAULT; + } else { + if (len > sizeof(ro->raw_vcid_opts)) + len = sizeof(ro->raw_vcid_opts); + if (copy_to_user(optval, &ro->raw_vcid_opts, len)) + err = -EFAULT; + } + if (!err) + err = put_user(len, optlen); + return err; + } case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); @@ -803,23 +859,41 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } -static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) +{ + struct canxl_frame *cxl = (struct canxl_frame *)skb->data; + + /* sanitize non CAN XL bits */ + cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK); + + /* clear VCID in CAN XL frame if pass through is disabled */ + if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS)) + cxl->prio &= CANXL_PRIO_MASK; + + /* set VCID in CAN XL frame if enabled */ + if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) { + cxl->prio &= CANXL_PRIO_MASK; + cxl->prio |= ro->tx_vcid_shifted; + } +} + +static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { /* Classical CAN -> no checks for flags and device capabilities */ if (can_is_can_skb(skb)) - return false; + return CAN_MTU; /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) - return false; + return CANFD_MTU; /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && can_is_canxl_dev_mtu(mtu)) - return false; + return CANXL_MTU; - return true; + return 0; } static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -829,6 +903,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; + unsigned int txmtu; int ifindex; int err = -EINVAL; @@ -869,9 +944,16 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto free_skb; err = -EINVAL; - if (raw_bad_txframe(ro, skb, dev->mtu)) + + /* check for valid CAN (CC/FD/XL) frame content */ + txmtu = raw_check_txframe(ro, skb, dev->mtu); + if (!txmtu) goto free_skb; + /* only CANXL: clear/forward/set VCID value */ + if (txmtu == CANXL_MTU) + raw_put_canxl_vcid(ro, skb); + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 1daf95e17d..3a5bd1cd1e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -429,7 +429,10 @@ static int is_out(const struct crush_map *map, /** * crush_choose_firstn - choose numrep distinct items of given type * @map: the crush_map + * @work: working space initialized by crush_init_workspace() * @bucket: the bucket we are choose an item from + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector * @x: crush input value * @numrep: the number of items to choose * @type: the type of item to choose @@ -445,6 +448,7 @@ static int is_out(const struct crush_map *map, * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent + * @choose_args: weights and ids for each known bucket */ static int crush_choose_firstn(const struct crush_map *map, struct crush_work *work, @@ -636,9 +640,8 @@ reject: } -/** +/* * crush_choose_indep: alternative breadth-first positionally stable mapping - * */ static void crush_choose_indep(const struct crush_map *map, struct crush_work *work, diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f263f7e91a..ab66b599ac 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1085,13 +1085,19 @@ static void delayed_work(struct work_struct *work) struct ceph_mon_client *monc = container_of(work, struct ceph_mon_client, delayed_work.work); - dout("monc delayed_work\n"); mutex_lock(&monc->mutex); + dout("%s mon%d\n", __func__, monc->cur_mon); + if (monc->cur_mon < 0) { + goto out; + } + if (monc->hunting) { dout("%s continuing hunt\n", __func__); reopen_session(monc); } else { int is_auth = ceph_auth_is_authenticated(monc->auth); + + dout("%s is_authed %d\n", __func__, is_auth); if (ceph_con_keepalive_expired(&monc->con, CEPH_MONC_PING_TIMEOUT)) { dout("monc keepalive timeout\n"); @@ -1116,6 +1122,8 @@ static void delayed_work(struct work_struct *work) } } __schedule_delayed(monc); + +out: mutex_unlock(&monc->mutex); } @@ -1232,13 +1240,15 @@ EXPORT_SYMBOL(ceph_monc_init); void ceph_monc_stop(struct ceph_mon_client *monc) { dout("stop\n"); - cancel_delayed_work_sync(&monc->delayed_work); mutex_lock(&monc->mutex); __close_session(monc); + monc->hunting = false; monc->cur_mon = -1; mutex_unlock(&monc->mutex); + cancel_delayed_work_sync(&monc->delayed_work); + /* * flush msgr queue before we destroy ourselves to ensure that: * - any work that references our embedded con is finished. diff --git a/net/core/Makefile b/net/core/Makefile index 821aec06ab..62be9aef25 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,6 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o +obj-y += hotdata.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o @@ -25,6 +26,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -40,4 +42,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o -obj-$(CONFIG_NET_TEST) += gso_test.o +obj-$(CONFIG_NET_TEST) += net_test.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6c4d90b24d..bc01b3aa6b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) if (!bpf_capable()) return ERR_PTR(-EPERM); - nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { - if (nla_len(nla) != sizeof(u32)) - return ERR_PTR(-EINVAL); - nr_maps++; - } + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); - nla_for_each_nested(nla, nla_stgs, rem) { - struct bpf_map *map; - int map_fd; - - if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - continue; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); - map_fd = nla_get_u32(nla); - map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index a8b625abe2..e72dd78471 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ - bool slow; - - if (!skb_unref(skb)) { - sk_peek_offset_bwd(sk, len); - return; - } - - slow = lock_sock_fast(sk); - sk_peek_offset_bwd(sk, len); - skb_orphan(skb); - unlock_sock_fast(sk, slow); - - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); - int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, @@ -435,15 +416,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; if (copy > len) copy = len; - n = INDIRECT_CALL_1(cb, simple_copy_to_iter, - vaddr + skb_frag_off(frag) + offset - start, - copy, data, to); - kunmap(page); + + n = 0; + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + p_off, p_len, data, to); + kunmap_local(vaddr); + } + offset += n; if (n != copy) goto short_copy; diff --git a/net/core/dev.c b/net/core/dev.c index c365aa06f8..2b4819b610 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -77,7 +77,9 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> #include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> @@ -153,41 +155,21 @@ #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; -struct list_head ptype_all __read_mostly; /* Taps */ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); - static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ @@ -200,8 +182,9 @@ static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0) - ; + unsigned int val = net->dev_base_seq + 1; + + WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) @@ -216,35 +199,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock_irqsave(struct softnet_data *sd, - unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ + static_branch_enable(&use_backlog_threads_key); + return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) +{ + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_save(*flags); } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); @@ -341,13 +349,22 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return 0; } -static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +static void netdev_name_node_alt_free(struct rcu_head *head) { - list_del(&name_node->list); + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + kfree(name_node->name); netdev_name_node_free(name_node); } +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -362,10 +379,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; - netdev_name_node_del(name_node); - synchronize_rcu(); __netdev_name_node_alt_destroy(name_node); - return 0; } @@ -373,8 +387,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; - list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) - __netdev_name_node_alt_destroy(name_node); + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ @@ -385,12 +401,10 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); @@ -404,7 +418,7 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev, bool lock) +static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); @@ -417,13 +431,9 @@ static void unlist_netdevice(struct net_device *dev, bool lock) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - if (lock) - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - if (lock) - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -442,6 +452,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). + */ +static DEFINE_PER_CPU(struct page_pool *, system_page_pool); + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class @@ -551,7 +567,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; @@ -653,7 +669,7 @@ int dev_get_iflink(const struct net_device *dev) if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); - return dev->ifindex; + return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); @@ -738,9 +754,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path); * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -821,8 +837,7 @@ EXPORT_SYMBOL(netdev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -924,6 +939,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) } EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} + /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace @@ -935,7 +962,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -944,12 +970,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } @@ -1201,7 +1226,10 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); + write_sequnlock_bh(&netdev_rename_lock); + if (err < 0) { up_write(&devnet_rename_sem); return err; @@ -1212,13 +1240,13 @@ int dev_change_name(struct net_device *dev, const char *newname) dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; - dev->name_assign_type = NET_NAME_RENAMED; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); up_write(&devnet_rename_sem); return ret; } @@ -1227,15 +1255,11 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); - synchronize_rcu(); + synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1245,9 +1269,11 @@ rollback: if (err >= 0) { err = ret; down_write(&devnet_rename_sem); + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { @@ -2073,6 +2099,11 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +EXPORT_SYMBOL(tcf_bypass_check_needed_key); +#endif + DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL @@ -2242,7 +2273,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) */ bool dev_nit_active(struct net_device *dev) { - return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all); + return !list_empty(&net_hotdata.ptype_all) || + !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); @@ -2253,10 +2285,9 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct packet_type *ptype; + struct list_head *ptype_list = &net_hotdata.ptype_all; + struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; - struct packet_type *pt_prev = NULL; - struct list_head *ptype_list = &ptype_all; rcu_read_lock(); again: @@ -2302,7 +2333,7 @@ again: pt_prev = ptype; } - if (ptype_list == &ptype_all) { + if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -3933,6 +3964,11 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; + if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { + if (tcf_block_bypass_sw(miniq->block)) + return ret; + } + tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); @@ -4426,20 +4462,11 @@ EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -int netdev_max_backlog __read_mostly = 1000; -EXPORT_SYMBOL(netdev_max_backlog); - -int netdev_tstamp_prequeue __read_mostly = 1; -unsigned int sysctl_skb_defer_max __read_mostly = 64; -int netdev_budget __read_mostly = 300; -/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ -unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ -int dev_rx_weight __read_mostly = 64; -int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -4458,18 +4485,16 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { - /* Avoid doing set_bit() if the thread is in - * INTERRUPTIBLE state, cause napi_thread_wait() - * makes sure to proceed with napi polling - * if the thread is explicitly woken from here. - */ - if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) - set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } +use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() @@ -4481,12 +4506,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, #ifdef CONFIG_RPS -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); -u32 rps_cpu_mask __read_mostly; -EXPORT_SYMBOL(rps_cpu_mask); - struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; @@ -4497,12 +4516,13 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { + u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; u16 rxq_index; + u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ @@ -4524,16 +4544,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + WRITE_ONCE(rflow->filter, rc); + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -4578,7 +4598,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!hash) goto done; - sock_flow_table = rcu_dereference(rps_sock_flow_table); + sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; @@ -4588,10 +4608,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). */ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); - if ((ident ^ hash) & ~rps_cpu_mask) + if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; - next_cpu = ident & rps_cpu_mask; + next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table @@ -4612,7 +4632,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4666,9 +4686,9 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu < nr_cpu_ids && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < + if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && + ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - + READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } @@ -4715,6 +4735,11 @@ static void napi_schedule_rps(struct softnet_data *sd) #ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4729,6 +4754,23 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } +} + #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif @@ -4740,7 +4782,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) + if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4780,36 +4822,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; - reason = SKB_DROP_REASON_NOT_SPECIFIED; + reason = SKB_DROP_REASON_DEV_READY; + if (!netif_running(skb->dev)) + goto bad_dev; + + reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); - rps_lock_irqsave(sd, &flags); - if (!netif_running(skb->dev)) - goto drop; + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { - if (qlen) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock_irq_restore(sd, &flags); - return NET_RX_SUCCESS; + if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) - napi_schedule_rps(sd); - goto enqueue; + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; } - reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: - sd->dropped++; - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + atomic_inc(&sd->dropped); +bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; @@ -4864,6 +4915,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4893,6 +4950,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, skb->len += off; /* positive on grow, negative on shrink */ } + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. + */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || @@ -4926,11 +4991,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - u32 act = XDP_DROP; + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4938,41 +5027,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_is_redirected(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) + if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } - act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); + trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: - kfree_skb(skb); + kfree_skb(*pskb); break; } @@ -5010,24 +5094,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; - act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb, + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: - generic_xdp_tx(skb, xdp_prog); + generic_xdp_tx(*pskb, xdp_prog); break; } return XDP_DROP; @@ -5035,7 +5119,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -5044,7 +5128,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5336,7 +5420,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5358,7 +5442,8 @@ another_round: int ret2; migrate_disable(); - ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); migrate_enable(); if (ret2 != XDP_PASS) { @@ -5379,7 +5464,7 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &ptype_all, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -5719,7 +5804,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5749,7 +5834,8 @@ void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); + net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), + skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -5839,21 +5925,21 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } local_bh_enable(); @@ -5865,14 +5951,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); return do_flush; #endif @@ -5938,7 +6024,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; - if (remsd) { + if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); @@ -5953,7 +6039,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS - return sd->rps_ipi_list != NULL; + return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif @@ -5973,7 +6059,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = READ_ONCE(dev_rx_weight); + napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; @@ -5981,13 +6067,14 @@ static int process_backlog(struct napi_struct *napi, int quota) rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); - if (++work >= quota) + if (++work >= quota) { + rps_input_queue_head_add(sd, work); return work; + } } - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -5997,15 +6084,17 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; again = false; } else { skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); } + if (work) + rps_input_queue_head_add(sd, work); return work; } @@ -6162,6 +6251,27 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock(&sd->defer_lock); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock(&sd->defer_lock); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) @@ -6183,8 +6293,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, - u16 budget) +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6204,7 +6319,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_disable(); - if (prefer_busy_poll) { + if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { @@ -6228,23 +6343,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_enable(); } -void napi_busy_loop(unsigned int napi_id, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll, u16 budget) +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; + WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; - rcu_read_lock(); - napi = napi_by_id(napi_id); if (!napi) - goto out; + return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); @@ -6260,14 +6375,14 @@ restart: */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } @@ -6281,18 +6396,22 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); + skb_defer_free_flush(this_cpu_ptr(&softnet_data)); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); + rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; @@ -6300,10 +6419,31 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); -out: +} + +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; + + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); @@ -6391,7 +6531,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } } - dev->threaded = threaded; + WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. @@ -6482,7 +6622,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = 0; + dev->threaded = false; netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); @@ -6660,8 +6800,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { - bool woken = false; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -6670,15 +6808,13 @@ static int napi_thread_wait(struct napi_struct *napi) * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ - if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); - /* woken being true indicates this thread owns this napi. */ - woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -6686,64 +6822,48 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void skb_defer_free_flush(struct softnet_data *sd) +static void napi_threaded_poll_loop(struct napi_struct *napi) { - struct sk_buff *skb, *next; + struct softnet_data *sd; + unsigned long last_qs = jiffies; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; + for (;;) { + bool repoll = false; + void *have; - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + sd->in_napi_threaded_poll = false; + barrier(); + + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(sd); + local_bh_enable(); + + if (!repoll) + break; + + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; - struct softnet_data *sd; - void *have; - - while (!napi_thread_wait(napi)) { - unsigned long last_qs = jiffies; - - for (;;) { - bool repoll = false; - - local_bh_disable(); - sd = this_cpu_ptr(&softnet_data); - sd->in_napi_threaded_poll = true; - - have = netpoll_poll_lock(napi); - __napi_poll(napi, &repoll); - netpoll_poll_unlock(have); - sd->in_napi_threaded_poll = false; - barrier(); - - if (sd_has_rps_ipi_waiting(sd)) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } - skb_defer_free_flush(sd); - local_bh_enable(); - - if (!repoll) - break; + while (!napi_thread_wait(napi)) + napi_threaded_poll_loop(napi); - rcu_softirq_qs_periodic(last_qs); - cond_resched(); - } - } return 0; } @@ -6751,8 +6871,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); - int budget = READ_ONCE(netdev_budget); + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); @@ -8424,27 +8544,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { @@ -8495,25 +8617,27 @@ EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); @@ -8595,12 +8719,12 @@ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; - flags = (dev->flags & ~(IFF_PROMISC | + flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | - (dev->gflags & (IFF_PROMISC | + (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { @@ -8839,7 +8963,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) return -ERANGE; if (new_len != orig_len) { - dev->tx_queue_len = new_len; + WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) @@ -8853,7 +8977,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; + WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } @@ -8923,7 +9047,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); -static DECLARE_RWSEM(dev_addr_sem); +DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) @@ -9099,7 +9223,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); - dev->proto_down = proto_down; + WRITE_ONCE(dev->proto_down, proto_down); return 0; } @@ -9113,18 +9237,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { + u32 proto_down_reason; int b; if (!mask) { - dev->proto_down_reason = value; + proto_down_reason = value; } else { + proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) - dev->proto_down_reason |= BIT(b); + proto_down_reason |= BIT(b); else - dev->proto_down_reason &= ~BIT(b); + proto_down_reason &= ~BIT(b); } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { @@ -9677,11 +9804,11 @@ static void dev_index_release(struct net *net, int ifindex) /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -10246,9 +10373,9 @@ int register_netdevice(struct net_device *dev) goto err_ifindex_release; ret = netdev_register_kobject(dev); - write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; - write_unlock(&dev_base_lock); + + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); + if (ret) goto err_uninit_notify; @@ -10314,25 +10441,12 @@ err_free_name: } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * This is useful if you are calling this function after alloc_netdev(), + * since it does not memset the net_device fields. */ -int init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev_core(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ @@ -10352,12 +10466,30 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. */ +} - return 0; +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initializes the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +void init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * as they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + init_dummy_netdev_core(dev); } EXPORT_SYMBOL_GPL(init_dummy_netdev); - /** * register_netdev - register a network device * @dev: device to register @@ -10455,8 +10587,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) rebroadcast_time = jiffies; } + rcu_barrier(); + if (!wait) { - rcu_barrier(); wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); @@ -10508,6 +10641,7 @@ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; + int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -10538,12 +10672,11 @@ void netdev_run_todo(void) continue; } - write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); linkwatch_sync_dev(dev); } + cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); @@ -10561,12 +10694,13 @@ void netdev_run_todo(void) if (dev->needs_free_netdev) free_netdev(dev); - if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) - wake_up(&netdev_unregistering_wq); + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has @@ -10643,6 +10777,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + dev_get_tstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } @@ -10951,13 +11087,14 @@ void free_netdev(struct net_device *dev) dev->xdp_bulkq = NULL; /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { netdev_freemem(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -10965,6 +11102,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev_core); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. @@ -11013,6 +11163,7 @@ void unregister_netdevice_many_notify(struct list_head *head, { struct net_device *dev, *tmp; LIST_HEAD(close_head); + int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -11044,10 +11195,8 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - write_lock(&dev_base_lock); - unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; - write_unlock(&dev_base_lock); + unlist_netdevice(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); } flush_all_backlogs(); @@ -11109,7 +11258,9 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); + cnt++; } + atomic_add(cnt, &dev_unreg_count); list_del(head); } @@ -11227,7 +11378,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev, true); + unlist_netdevice(dev); synchronize_net(); @@ -11266,8 +11417,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - if (new_name[0]) /* Rename the netdev to prepared name */ + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); + } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); @@ -11342,7 +11497,7 @@ static int dev_cpu_dead(unsigned int oldcpu) list_del_init(&napi->poll_list); if (napi->poll == process_backlog) - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } @@ -11350,21 +11505,23 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; - oldsd->rps_ipi_list = NULL; + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; #endif - /* send out pending IPI's on offline CPU */ - net_rps_send_ipi(remsd); + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; @@ -11563,11 +11720,8 @@ static void __net_exit default_device_exit_net(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) - if (netdev_name_in_use(&init_net, name_node->name)) { - netdev_name_node_del(name_node); - synchronize_rcu(); + if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); - } err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { @@ -11675,6 +11829,60 @@ static void __init net_dev_struct_check(void) * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .flags = PP_FLAG_SYSTEM_POOL, + .nid = cpu_to_mem(cpuid), + }; + struct page_pool *pp_ptr; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + per_cpu(system_page_pool, cpuid) = pp_ptr; +#endif + return 0; +} + +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -11693,7 +11901,6 @@ static int __init net_dev_init(void) if (netdev_kobject_init()) goto out; - INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); @@ -11727,7 +11934,13 @@ static int __init net_dev_init(void) init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + INIT_LIST_HEAD(&sd->backlog.poll_list); + + if (net_page_pool_create(i)) + goto out; } + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -11753,7 +11966,24 @@ static int __init net_dev_init(void) NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool, i); + if (!pp_ptr) + continue; + + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool, i) = NULL; + } + } + return rc; } diff --git a/net/core/dev.h b/net/core/dev.h index 7480b4c842..b7b518bc2b 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -3,11 +3,10 @@ #define _NET_CORE_DEV_H #include <linux/types.h> +#include <linux/rwsem.h> +#include <linux/netdevice.h> struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; @@ -37,15 +36,13 @@ int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; -extern unsigned int sysctl_skb_defer_max; -extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; +extern struct rw_semaphore dev_addr_sem; + /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); @@ -56,6 +53,7 @@ struct netdev_name_node { struct list_head list; struct net_device *dev; const char *name; + struct rcu_head rcu; }; int netdev_get_name(struct net *net, char *name, int ifindex); @@ -149,4 +147,23 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + +#define XMIT_RECURSION_LIMIT 8 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + #endif diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 4dbd0dc6ae..8e1dba825e 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test) KUNIT_FAIL(test, "Can't register netdev %d", err); } - rtnl_lock(); return 0; } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test) { struct net_device *netdev = test->priv; - rtnl_unlock(); unregister_netdev(netdev); free_netdev(netdev); } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); memset(addr, 2, sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test) memset(addr, 3, sizeof(addr)); dev_addr_set(netdev, addr); KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + rtnl_unlock(); } static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test) * considered synced and we overwrite in place. */ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); for (i = 1; i < 4; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test) NETDEV_HW_ADDR_T_LAN)); KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, NETDEV_HW_ADDR_T_LAN)); + rtnl_unlock(); } static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); /* There is no external API like dev_addr_add_excl(), * so shuffle the tree a little bit and exploit aliasing. */ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test) u8 addr[ETH_ALEN]; int i; + rtnl_lock(); for (i = 0; i < 10; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test) memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); } + rtnl_unlock(); } static struct kunit_case dev_addr_test_cases[] = { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b0f221d658..430ed18f85 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -74,7 +74,7 @@ struct net_dm_hw_entries { }; struct per_cpu_dm_data { - spinlock_t lock; /* Protects 'skb', 'hw_entries' and + raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { @@ -168,9 +168,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: - spin_lock_irqsave(&data->lock, flags); + raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -225,7 +225,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); - spin_lock(&data->lock); + raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) @@ -259,7 +259,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, @@ -314,9 +314,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } @@ -448,7 +448,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) @@ -477,7 +477,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, } out: - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -1673,7 +1673,7 @@ static struct notifier_block dropmon_net_notifier = { static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { - spin_lock_init(&data->lock); + raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } diff --git a/net/core/dst.c b/net/core/dst.c index 6838d3212c..95f533844f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -96,7 +96,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -struct dst_entry *dst_destroy(struct dst_entry * dst) +static void dst_destroy(struct dst_entry *dst) { struct dst_entry *child = NULL; @@ -126,15 +126,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) dst = child; if (dst) dst_release_immediate(dst); - return NULL; } -EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst = dst_destroy(dst); + dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5c..70c634b9e7 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -27,6 +27,7 @@ struct dst_cache_pcpu { static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, struct dst_entry *dst, u32 cookie) { + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst_release(dst_cache->dst); if (dst) dst_hold(dst); @@ -40,6 +41,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, { struct dst_entry *dst; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); dst = idst->dst; if (!dst) goto fail; @@ -47,7 +49,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); - if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + if (unlikely(!time_after(idst->refresh_ts, + READ_ONCE(dst_cache->reset_ts)) || (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); @@ -83,7 +86,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) return NULL; *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -111,8 +114,8 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, return; idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, - rt6_get_cookie((struct rt6_info *)dst)); + dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -170,7 +173,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache) if (!dst_cache->cache) return; - dst_cache->reset_ts = jiffies; + dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3f933ffcef..6ebffbc632 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1142,10 +1142,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; - int idx = 0, family; + int err, idx = 0, family; if (cb->strict_check) { - int err = fib_valid_dumprule_req(nlh, cb->extack); + err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; @@ -1158,17 +1158,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - dump_rules(skb, cb, ops); - - return skb->len; + return dump_rules(skb, cb, ops); } + err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; - if (dump_rules(skb, cb, ops) < 0) + err = dump_rules(skb, cb, ops); + if (err < 0) break; cb->args[1] = 0; @@ -1178,7 +1178,7 @@ skip: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static void notify_rule_change(int event, struct fib_rule *rule, @@ -1293,7 +1293,8 @@ static int __init fib_rules_init(void) int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, + RTNL_FLAG_DUMP_UNLOCKED); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index ef3e78b6a3..ab0455c64e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,8 +87,11 @@ #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id); +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -778,7 +781,7 @@ jmp_rest: BPF_EMIT_JMP; break; - /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, @@ -804,7 +807,7 @@ jmp_rest: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } - /* RET_K is remaped into 2 insns. RET_A case doesn't need an + /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: @@ -1662,6 +1665,11 @@ static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { +#ifdef CONFIG_DEBUG_NET + /* Avoid a splat in pskb_may_pull_reason() */ + if (write_len > INT_MAX) + return -EINVAL; +#endif return skb_ensure_writable(skb, write_len); } @@ -2215,7 +2223,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { dst = skb_dst(skb); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; @@ -2271,12 +2279,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2314,8 +2322,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { @@ -2378,12 +2385,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2968,7 +2975,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to - * the rigth free'ing the next element in ring to place B, leaving + * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { @@ -3537,13 +3544,20 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); - /* Due to header grow, MSS needs to be downgraded. */ - if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) - skb_decrease_gso_size(shinfo, len_diff); - /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; + + /* Due to header growth, MSS needs to be downgraded. + * There is a BUG_ON() when segmenting the frag_list with + * head_frag true, so linearize the skb after downgrading + * the MSS. + */ + if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { + skb_decrease_gso_size(shinfo, len_diff); + if (shinfo->frag_list) + return skb_linearize(skb); + } } return 0; @@ -4360,10 +4374,12 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { @@ -4375,11 +4391,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by bpf_clear_redirect_map() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } @@ -4442,9 +4467,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, - void *fwd, - enum bpf_map_type map_type, u32 map_id) + struct bpf_prog *xdp_prog, void *fwd, + enum bpf_map_type map_type, u32 map_id, + u32 flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map; @@ -4454,11 +4479,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by bpf_clear_redirect_map() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } @@ -4495,9 +4529,11 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { @@ -4517,7 +4553,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, return 0; } - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; @@ -4662,7 +4698,7 @@ set_compat: to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) - to->tunnel_flags = info->key.tun_flags; + to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; @@ -4705,7 +4741,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) int err; if (unlikely(!info || - !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } @@ -4775,15 +4811,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; - if (flags & BPF_F_DONT_FRAGMENT) - info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - if (flags & BPF_F_SEQ_NUMBER) - info->key.tun_flags |= TUNNEL_SEQ; - if (flags & BPF_F_NO_TUNNEL_KEY) - info->key.tun_flags &= ~TUNNEL_KEY; + __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); + __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, + flags & BPF_F_DONT_FRAGMENT); + __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, + !(flags & BPF_F_ZERO_CSUM_TX)); + __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, + flags & BPF_F_SEQ_NUMBER); + __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, + !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -4821,13 +4857,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); + IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); + ip_tunnel_set_options_present(present); + ip_tunnel_info_opts_set(info, from, size, present); return 0; } @@ -5884,7 +5922,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -5988,7 +6029,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, return -ENODEV; idev = __in6_dev_get_safely(dev); - if (unlikely(!idev || !idev->cnf.forwarding)) + if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { @@ -6027,7 +6068,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6105,7 +6149,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -7894,7 +7938,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -7987,7 +8031,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8006,7 +8050,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8193,7 +8237,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8252,13 +8296,13 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able - * to use them interchangably with the same BTF type ID. Because modules + * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will @@ -8313,7 +8357,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8342,8 +8386,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: @@ -8355,7 +8397,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8399,7 +8441,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8410,7 +8452,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8437,7 +8479,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8612,7 +8654,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } @@ -8624,7 +8666,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: @@ -11268,7 +11310,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -11450,7 +11492,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11784,7 +11826,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id) +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; @@ -11813,10 +11855,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; @@ -11869,6 +11911,103 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, return 0; } + +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk, + struct bpf_tcp_req_attrs *attrs, int attrs__sz) +{ +#if IS_ENABLED(CONFIG_SYN_COOKIES) + const struct request_sock_ops *ops; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct request_sock *req; + struct net *net; + __u16 min_mss; + u32 tsoff = 0; + + if (attrs__sz != sizeof(*attrs) || + attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) + return -EINVAL; + + if (!skb_at_tc_ingress(skb)) + return -EINVAL; + + net = dev_net(skb->dev); + if (net != sock_net(sk)) + return -ENETUNREACH; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ops = &tcp_request_sock_ops; + min_mss = 536; + break; +#if IS_BUILTIN(CONFIG_IPV6) + case htons(ETH_P_IPV6): + ops = &tcp6_request_sock_ops; + min_mss = IPV6_MIN_MTU - 60; + break; +#endif + default: + return -EINVAL; + } + + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || + sk_is_mptcp(sk)) + return -EINVAL; + + if (attrs->mss < min_mss) + return -EINVAL; + + if (attrs->wscale_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) + return -EINVAL; + + if (attrs->snd_wscale > TCP_MAX_WSCALE || + attrs->rcv_wscale > TCP_MAX_WSCALE) + return -EINVAL; + } + + if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) + return -EINVAL; + + if (attrs->tstamp_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) + return -EINVAL; + + tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); + } + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return -ENOMEM; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + + req->rsk_listener = sk; + req->syncookie = 1; + req->mss = attrs->mss; + req->ts_recent = attrs->rcv_tsval; + + ireq->snd_wscale = attrs->snd_wscale; + ireq->rcv_wscale = attrs->rcv_wscale; + ireq->tstamp_ok = !!attrs->tstamp_ok; + ireq->sack_ok = !!attrs->sack_ok; + ireq->wscale_ok = !!attrs->wscale_ok; + ireq->ecn_ok = !!attrs->ecn_ok; + + treq->req_usec_ts = !!attrs->usec_ts_ok; + treq->ts_off = tsoff; + + skb_orphan(skb); + skb->sk = req_to_sk(req); + skb->destructor = sock_pfree; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, @@ -11885,17 +12024,21 @@ int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, return 0; } -BTF_SET8_START(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) -BTF_SET8_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb) -BTF_SET8_START(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) -BTF_SET8_END(bpf_kfunc_check_set_xdp) +BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) -BTF_SET8_START(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) -BTF_SET8_END(bpf_kfunc_check_set_sock_addr) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) + +BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, @@ -11912,6 +12055,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .set = &bpf_kfunc_check_set_sock_addr, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_tcp_reqsk, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -11927,8 +12075,9 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, - &bpf_kfunc_set_sock_addr); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + &bpf_kfunc_set_sock_addr); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); } late_initcall(bpf_kfunc_init); @@ -11968,9 +12117,9 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); -BTF_SET8_START(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) -BTF_SET8_END(bpf_sk_iter_kfunc_ids) +BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 272f092513..7b54f44f53 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -455,17 +455,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); - if (info->options_len) { - enc_opt->len = info->options_len; - ip_tunnel_info_opts_get(enc_opt->data, info); - enc_opt->dst_opt_type = info->key.tun_flags & - TUNNEL_OPTIONS_PRESENT; - } + if (!info->options_len) + return; + + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + + ip_tunnel_set_options_present(flags); + ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + + val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, + IP_TUNNEL_GENEVE_OPT_BIT); + enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -1093,7 +1101,7 @@ bool __skb_flow_dissect(const struct net *net, } } - WARN_ON_ONCE(!net); + DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; diff --git a/net/core/gro.c b/net/core/gro.c index cefddf65f7..b3b43de1a6 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@ #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> +#include <linux/skbuff_ref.h> #define MAX_GRO_SKBS 8 @@ -10,9 +11,6 @@ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); -struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers @@ -31,7 +29,7 @@ void dev_add_offload(struct packet_offload *po) struct packet_offload *elem; spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { + list_for_each_entry(elem, &net_hotdata.offload_base, list) { if (po->priority < elem->priority) break; } @@ -55,7 +53,7 @@ EXPORT_SYMBOL(dev_add_offload); */ static void __dev_remove_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *po1; spin_lock(&offload_lock); @@ -233,12 +231,39 @@ done: return 0; } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} + static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); @@ -333,8 +358,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs; - NAPI_GRO_CB(p)->flush = 0; - if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; @@ -370,15 +393,22 @@ static void gro_list_prepare(const struct list_head *head, static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; + const struct skb_shared_info *pinfo; + const skb_frag_t *frag0; + unsigned int headlen; + NAPI_GRO_CB(skb)->network_offset = 0; NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + headlen = skb_headlen(skb); + NAPI_GRO_CB(skb)->frag0 = skb->data; + NAPI_GRO_CB(skb)->frag0_len = headlen; + if (headlen) + return; - if (!skb_headlen(skb) && pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0)) && + pinfo = skb_shinfo(skb); + frag0 = &pinfo->frags[0]; + + if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, @@ -439,7 +469,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; + struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -467,7 +497,6 @@ found_ptype: sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; @@ -545,7 +574,7 @@ normal: struct packet_offload *gro_find_receive_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -559,7 +588,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { - struct list_head *offload_head = &offload_base; + struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { @@ -701,7 +730,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) skb_reset_mac_header(skb); skb_gro_reset_offset(skb, hlen); - if (unlikely(skb_gro_header_hard(skb, hlen))) { + if (unlikely(!skb_gro_may_pull(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", @@ -711,7 +740,10 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) } } else { eth = (const struct ethhdr *)skb->data; - gro_pull_from_frag0(skb, hlen); + + if (NAPI_GRO_CB(skb)->frag0 != skb->data) + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ed5ec5de47..ff8e5b64bf 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -3,6 +3,7 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> +#include <net/hotdata.h> struct gro_cell { struct sk_buff_head napi_skbs; @@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); diff --git a/net/core/gso.c b/net/core/gso.c index 9e1803bfc9..bcd156372f 100644 --- a/net/core/gso.c +++ b/net/core/gso.c @@ -17,7 +17,7 @@ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, struct packet_offload *ptype; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; @@ -48,7 +48,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, __skb_pull(skb, vlan_depth); rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { + list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; diff --git a/net/core/gso_test.c b/net/core/gso_test.c deleted file mode 100644 index 358c44680d..0000000000 --- a/net/core/gso_test.c +++ /dev/null @@ -1,274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <kunit/test.h> -#include <linux/skbuff.h> - -static const char hdr[] = "abcdefgh"; -#define GSO_TEST_SIZE 1000 - -static void __init_skb(struct sk_buff *skb) -{ - skb_reset_mac_header(skb); - memcpy(skb_mac_header(skb), hdr, sizeof(hdr)); - - /* skb_segment expects skb->data at start of payload */ - skb_pull(skb, sizeof(hdr)); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - /* proto is arbitrary, as long as not ETH_P_TEB or vlan */ - skb->protocol = htons(ETH_P_ATALK); - skb_shinfo(skb)->gso_size = GSO_TEST_SIZE; -} - -enum gso_test_nr { - GSO_TEST_LINEAR, - GSO_TEST_NO_GSO, - GSO_TEST_FRAGS, - GSO_TEST_FRAGS_PURE, - GSO_TEST_GSO_PARTIAL, - GSO_TEST_FRAG_LIST, - GSO_TEST_FRAG_LIST_PURE, - GSO_TEST_FRAG_LIST_NON_UNIFORM, - GSO_TEST_GSO_BY_FRAGS, -}; - -struct gso_test_case { - enum gso_test_nr id; - const char *name; - - /* input */ - unsigned int linear_len; - unsigned int nr_frags; - const unsigned int *frags; - unsigned int nr_frag_skbs; - const unsigned int *frag_skbs; - - /* output as expected */ - unsigned int nr_segs; - const unsigned int *segs; -}; - -static struct gso_test_case cases[] = { - { - .id = GSO_TEST_NO_GSO, - .name = "no_gso", - .linear_len = GSO_TEST_SIZE, - .nr_segs = 1, - .segs = (const unsigned int[]) { GSO_TEST_SIZE }, - }, - { - .id = GSO_TEST_LINEAR, - .name = "linear", - .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1, - .nr_segs = 3, - .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, - }, - { - .id = GSO_TEST_FRAGS, - .name = "frags", - .linear_len = GSO_TEST_SIZE, - .nr_frags = 2, - .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 }, - .nr_segs = 3, - .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, - }, - { - .id = GSO_TEST_FRAGS_PURE, - .name = "frags_pure", - .nr_frags = 3, - .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, - .nr_segs = 3, - .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, - }, - { - .id = GSO_TEST_GSO_PARTIAL, - .name = "gso_partial", - .linear_len = GSO_TEST_SIZE, - .nr_frags = 2, - .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 }, - .nr_segs = 2, - .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 }, - }, - { - /* commit 89319d3801d1: frag_list on mss boundaries */ - .id = GSO_TEST_FRAG_LIST, - .name = "frag_list", - .linear_len = GSO_TEST_SIZE, - .nr_frag_skbs = 2, - .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, - .nr_segs = 3, - .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE }, - }, - { - .id = GSO_TEST_FRAG_LIST_PURE, - .name = "frag_list_pure", - .nr_frag_skbs = 2, - .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, - .nr_segs = 2, - .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, - }, - { - /* commit 43170c4e0ba7: GRO of frag_list trains */ - .id = GSO_TEST_FRAG_LIST_NON_UNIFORM, - .name = "frag_list_non_uniform", - .linear_len = GSO_TEST_SIZE, - .nr_frag_skbs = 4, - .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 }, - .nr_segs = 4, - .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 }, - }, - { - /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and - * commit 90017accff61 ("sctp: Add GSO support") - * - * "there will be a cover skb with protocol headers and - * children ones containing the actual segments" - */ - .id = GSO_TEST_GSO_BY_FRAGS, - .name = "gso_by_frags", - .nr_frag_skbs = 4, - .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 }, - .nr_segs = 4, - .segs = (const unsigned int[]) { 100, 200, 300, 400 }, - }, -}; - -static void gso_test_case_to_desc(struct gso_test_case *t, char *desc) -{ - sprintf(desc, "%s", t->name); -} - -KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc); - -static void gso_test_func(struct kunit *test) -{ - const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - struct sk_buff *skb, *segs, *cur, *next, *last; - const struct gso_test_case *tcase; - netdev_features_t features; - struct page *page; - int i; - - tcase = test->param_value; - - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size); - KUNIT_ASSERT_NOT_NULL(test, skb); - __skb_put(skb, sizeof(hdr) + tcase->linear_len); - - __init_skb(skb); - - if (tcase->nr_frags) { - unsigned int pg_off = 0; - - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - page_ref_add(page, tcase->nr_frags - 1); - - for (i = 0; i < tcase->nr_frags; i++) { - skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]); - pg_off += tcase->frags[i]; - } - - KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE); - - skb->data_len = pg_off; - skb->len += skb->data_len; - skb->truesize += skb->data_len; - } - - if (tcase->frag_skbs) { - unsigned int total_size = 0, total_true_size = 0; - struct sk_buff *frag_skb, *prev = NULL; - - for (i = 0; i < tcase->nr_frag_skbs; i++) { - unsigned int frag_size; - - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - - frag_size = tcase->frag_skbs[i]; - frag_skb = build_skb(page_address(page), - frag_size + shinfo_size); - KUNIT_ASSERT_NOT_NULL(test, frag_skb); - __skb_put(frag_skb, frag_size); - - if (prev) - prev->next = frag_skb; - else - skb_shinfo(skb)->frag_list = frag_skb; - prev = frag_skb; - - total_size += frag_size; - total_true_size += frag_skb->truesize; - } - - skb->len += total_size; - skb->data_len += total_size; - skb->truesize += total_true_size; - - if (tcase->id == GSO_TEST_GSO_BY_FRAGS) - skb_shinfo(skb)->gso_size = GSO_BY_FRAGS; - } - - features = NETIF_F_SG | NETIF_F_HW_CSUM; - if (tcase->id == GSO_TEST_GSO_PARTIAL) - features |= NETIF_F_GSO_PARTIAL; - - /* TODO: this should also work with SG, - * rather than hit BUG_ON(i >= nfrags) - */ - if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM) - features &= ~NETIF_F_SG; - - segs = skb_segment(skb, features); - if (IS_ERR(segs)) { - KUNIT_FAIL(test, "segs error %pe", segs); - goto free_gso_skb; - } else if (!segs) { - KUNIT_FAIL(test, "no segments"); - goto free_gso_skb; - } - - last = segs->prev; - for (cur = segs, i = 0; cur; cur = next, i++) { - next = cur->next; - - KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]); - - /* segs have skb->data pointing to the mac header */ - KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data); - KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr)); - - /* header was copied to all segs */ - KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0); - - /* last seg can be found through segs->prev pointer */ - if (!next) - KUNIT_ASSERT_PTR_EQ(test, cur, last); - - consume_skb(cur); - } - - KUNIT_ASSERT_EQ(test, i, tcase->nr_segs); - -free_gso_skb: - consume_skb(skb); -} - -static struct kunit_case gso_test_cases[] = { - KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), - {} -}; - -static struct kunit_suite gso_test_suite = { - .name = "net_core_gso", - .test_cases = gso_test_cases, -}; - -kunit_test_suite(gso_test_suite); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("KUnit tests for segmentation offload"); diff --git a/net/core/hotdata.c b/net/core/hotdata.c new file mode 100644 index 0000000000..d0aaaaa556 --- /dev/null +++ b/net/core/hotdata.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/cache.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> + +struct net_hotdata net_hotdata __cacheline_aligned = { + .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), + .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), + .gro_normal_batch = 8, + + .netdev_budget = 300, + /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ + .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ, + + .tstamp_prequeue = 1, + .max_backlog = 1000, + .dev_tx_weight = 64, + .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 64, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE +}; +EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 0000000000..759a9b9f3f --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". + */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, + [IEEE8021Q_TT_VO] = 5, + [IEEE8021Q_TT_IC] = 6, + [IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4, + [IEEE8021Q_TT_IC] = 5, + [IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2, + [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3, + [IEEE8021Q_TT_IC] = 4, + [IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, + [IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, + [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. + */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ + if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { + pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, + IEEE8021Q_TT_MAX); + return -EINVAL; + } + + switch (num_queues) { + case 8: + compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_8queue_tt_tc_map != max - 1"); + return ieee8021q_8queue_tt_tc_map[tt]; + case 7: + compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_7queue_tt_tc_map != max - 1"); + + return ieee8021q_7queue_tt_tc_map[tt]; + case 6: + compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_6queue_tt_tc_map != max - 1"); + + return ieee8021q_6queue_tt_tc_map[tt]; + case 5: + compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_5queue_tt_tc_map != max - 1"); + + return ieee8021q_5queue_tt_tc_map[tt]; + case 4: + compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_4queue_tt_tc_map != max - 1"); + + return ieee8021q_4queue_tt_tc_map[tt]; + case 3: + compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_3queue_tt_tc_map != max - 1"); + + return ieee8021q_3queue_tt_tc_map[tt]; + case 2: + compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_2queue_tt_tc_map != max - 1"); + + return ieee8021q_2queue_tt_tc_map[tt]; + case 1: + compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_1queue_tt_tc_map != max - 1"); + + return ieee8021q_1queue_tt_tc_map[tt]; + } + + pr_err("Invalid number of queues %d\n", num_queues); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describe + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + switch (dscp) { + case DSCP_CS0: + /* Comment from RFC8325: + * [RFC4594], Section 4.8, recommends High-Throughput Data be marked + * AF1x (that is, AF11, AF12, and AF13, according to the rules defined + * in [RFC2475]). + * + * By default (as described in Section 2.3), High-Throughput Data will + * map to UP 1 and, thus, to the Background Access Category (AC_BK), + * which is contrary to the intent expressed in [RFC4594]. + + * Unfortunately, there really is no corresponding fit for the High- + * Throughput Data service class within the constrained 4 Access + * Category [IEEE.802.11-2016] model. If the High-Throughput Data + * service class is assigned to the Best Effort Access Category (AC_BE), + * then it would contend with Low-Latency Data (while [RFC4594] + * recommends a distinction in servicing between these service classes) + * as well as with the default service class; alternatively, if it is + * assigned to the Background Access Category (AC_BK), then it would + * receive a less-then-best-effort service and contend with Low-Priority + * Data (as discussed in Section 4.2.10). + * + * As such, since there is no directly corresponding fit for the High- + * Throughout Data service class within the [IEEE.802.11-2016] model, it + * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby + * admitting it to the Best Effort Access Category (AC_BE). + * + * Note: The above text is from RFC8325 which is describing the mapping + * between DSCP and 802.11 User Priority (UP) values. The mapping + * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but + * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q + * Traffic Types BE and BK. + */ + case DSCP_AF11: + case DSCP_AF12: + case DSCP_AF13: + return IEEE8021Q_TT_BE; + /* Comment from RFC8325: + * RFC3662 and RFC4594 both recommend Low-Priority Data be marked + * with DSCP CS1. The Low-Priority Data service class loosely + * corresponds to the [IEEE.802.11-2016] Background Access Category + */ + case DSCP_CS1: + return IEEE8021Q_TT_BK; + case DSCP_CS2: + case DSCP_AF21: + case DSCP_AF22: + case DSCP_AF23: + return IEEE8021Q_TT_EE; + case DSCP_CS3: + case DSCP_AF31: + case DSCP_AF32: + case DSCP_AF33: + return IEEE8021Q_TT_CA; + case DSCP_CS4: + case DSCP_AF41: + case DSCP_AF42: + case DSCP_AF43: + return IEEE8021Q_TT_VI; + case DSCP_CS5: + case DSCP_EF: + case DSCP_VOICE_ADMIT: + return IEEE8021Q_TT_VO; + case DSCP_CS6: + return IEEE8021Q_TT_IC; + case DSCP_CS7: + return IEEE8021Q_TT_NC; + } + + return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 429571c258..ab15064114 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; @@ -62,16 +62,13 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); - if (operstate == dev->operstate) + if (operstate == READ_ONCE(dev->operstate)) return; - write_lock(&dev_base_lock); - switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) @@ -87,9 +84,7 @@ static void rfc2863_policy(struct net_device *dev) break; } - dev->operstate = operstate; - - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->operstate, operstate); } @@ -153,9 +148,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_wq, &linkwatch_work, 0); + mod_delayed_work(system_unbound_wq, &linkwatch_work, 0); else - schedule_delayed_work(&linkwatch_work, delay); + queue_delayed_work(system_unbound_wq, &linkwatch_work, delay); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 552719c3bb..45fd88405b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -734,7 +734,9 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -1769,7 +1771,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { @@ -1826,13 +1828,19 @@ void neigh_table_init(int index, struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; - neigh_tables[index] = tbl; + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */ int neigh_table_clear(int index, struct neigh_table *tbl) { - neigh_tables[index] = NULL; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); + /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); @@ -1864,10 +1872,10 @@ static struct neigh_table *neigh_find_table(int family) switch (family) { case AF_INET: - tbl = neigh_tables[NEIGH_ARP_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: - tbl = neigh_tables[NEIGH_ND_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } @@ -2331,7 +2339,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2519,7 +2527,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; @@ -2674,7 +2682,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = dev ? netdev_master_upper_dev_get(dev) : NULL; + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". @@ -2707,7 +2715,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; @@ -2715,7 +2723,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { @@ -2729,23 +2736,19 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - flags) < 0) { - rc = -1; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: - rcu_read_unlock(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2754,7 +2757,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; @@ -2772,11 +2775,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, flags, tbl) < 0) { + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) { read_unlock_bh(&tbl->lock); - rc = -1; goto out; } next: @@ -2785,12 +2788,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, } read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; - + return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2878,8 +2879,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; + rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = neigh_tables[t]; + tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; @@ -2895,9 +2897,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } + rcu_read_unlock(); cb->args[0] = t; - return skb->len; + return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, @@ -3143,14 +3146,15 @@ int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; - tbl = neigh_tables[index]; - if (!tbl) - goto out; rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); + if (!tbl) + goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3166,6 +3170,7 @@ int neigh_xmit(int index, struct net_device *dev, goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { @@ -3728,7 +3733,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3779,7 +3784,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3807,8 +3811,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; @@ -3889,7 +3891,8 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, + RTNL_FLAG_DUMP_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 09f7ed1a04..fa6d396973 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -3,52 +3,22 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/wext.h> +#include <net/hotdata.h> #include "dev.h" -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) - -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) - -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos) { - struct net *net = seq_file_net(seq); + unsigned long ifindex = *pos; struct net_device *dev; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_index_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, index_hlist) { - if (++count == offset) - return dev; + for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { + *pos = dev->ifindex; + return dev; } - - return NULL; -} - -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); - return NULL; } -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { @@ -56,16 +26,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; - - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void dev_seq_stop(struct seq_file *seq, void *v) @@ -177,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + sd->processed, atomic_read(&sd->dropped), + sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ sd->received_rps, flow_limit_count, @@ -217,7 +185,7 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &ptype_all, list) { + list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { if (i == pos) return pt; ++i; @@ -265,13 +233,13 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) } } - nxt = ptype_all.next; + nxt = net_hotdata.ptype_all.next; goto ptype_all; } if (pt->type == htons(ETH_P_ALL)) { ptype_all: - if (nxt != &ptype_all) + if (nxt != &net_hotdata.ptype_all) goto found; hash = 0; nxt = ptype_base[0].next; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a09d507c5b..4c27a360c2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include <linux/of_net.h> #include <linux/cpu.h> #include <net/netdev_rx_queue.h> +#include <net/rps.h> #include "dev.h" #include "net-sysfs.h" @@ -34,10 +35,10 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or dev_base_lock */ +/* Caller holds RTNL or RCU */ static inline int dev_isalive(const struct net_device *dev) { - return dev->reg_state <= NETREG_REGISTERED; + return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* use same locking rules as GIF* ioctl's */ @@ -48,10 +49,10 @@ static ssize_t netdev_show(const struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -60,7 +61,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sysfs_emit(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -125,7 +126,7 @@ static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sysfs_emit(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, @@ -135,24 +136,28 @@ static ssize_t name_assign_type_show(struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (ndev->name_assign_type != NET_NAME_UNKNOWN) + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's */ +/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); - read_unlock(&dev_base_lock); + rcu_read_unlock(); + + up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); @@ -161,10 +166,13 @@ static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); + int ret = -EINVAL; + rcu_read_lock(); if (dev_isalive(ndev)) - return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); - return -EINVAL; + ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + rcu_read_unlock(); + return ret; } static DEVICE_ATTR_RO(broadcast); @@ -318,11 +326,9 @@ static ssize_t operstate_show(struct device *dev, const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; - read_lock(&dev_base_lock); - operstate = netdev->operstate; + operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; - read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ @@ -599,13 +605,13 @@ static ssize_t threaded_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + rcu_read_lock(); if (dev_isalive(netdev)) - ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); - rtnl_unlock(); return ret; } @@ -680,14 +686,14 @@ static ssize_t netstat_show(const struct device *d, WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -1409,6 +1415,65 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); +static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); +} + +static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, + const char *buf, size_t len) +{ + struct dql *dql = &queue->dql; + unsigned int value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + value = msecs_to_jiffies(value); + if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) + return -ERANGE; + + if (!dql->stall_thrs && value) + dql->last_reap = jiffies; + /* Force last_reap to be live */ + smp_wmb(); + dql->stall_thrs = value; + + return len; +} + +static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = + __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); + +static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +{ + return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); +} + +static ssize_t bql_set_stall_max(struct netdev_queue *queue, + const char *buf, size_t len) +{ + WRITE_ONCE(queue->dql.stall_max, 0); + return len; +} + +static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = + __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); + +static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +{ + struct dql *dql = &queue->dql; + + return sysfs_emit(buf, "%lu\n", dql->stall_cnt); +} + +static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = + __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); + static ssize_t bql_show_inflight(struct netdev_queue *queue, char *buf) { @@ -1447,6 +1512,9 @@ static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, + &bql_stall_thrs_attribute.attr, + &bql_stall_cnt_attribute.attr, + &bql_stall_max_attribute.attr, NULL }; @@ -1454,6 +1522,9 @@ static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; +#else +/* Fake declaration, all the code using it should be dead */ +extern const struct attribute_group dql_group; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS @@ -1691,6 +1762,15 @@ static const struct kobj_type netdev_queue_ktype = { .get_ownership = netdev_queue_get_ownership, }; +static bool netdev_uses_bql(const struct net_device *dev) +{ + if (dev->features & NETIF_F_LLTX || + dev->priv_flags & IFF_NO_QUEUE) + return false; + + return IS_ENABLED(CONFIG_BQL); +} + static int netdev_queue_add_kobject(struct net_device *dev, int index) { struct netdev_queue *queue = dev->_tx + index; @@ -1708,11 +1788,11 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; -#ifdef CONFIG_BQL - error = sysfs_create_group(kobj, &dql_group); - if (error) - goto err; -#endif + if (netdev_uses_bql(dev)) { + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto err; + } kobject_uevent(kobj, KOBJ_ADD); return 0; @@ -1733,9 +1813,9 @@ static int tx_queue_change_owner(struct net_device *ndev, int index, if (error) return error; -#ifdef CONFIG_BQL - error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); -#endif + if (netdev_uses_bql(ndev)) + error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); + return error; } #endif /* CONFIG_SYSFS */ @@ -1767,9 +1847,10 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; -#ifdef CONFIG_BQL - sysfs_remove_group(&queue->kobj, &dql_group); -#endif + + if (netdev_uses_bql(dev)) + sysfs_remove_group(&queue->kobj, &dql_group); + kobject_put(&queue->kobj); } @@ -1965,7 +2046,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) net_ns_get_ownership(net, uid, gid); } -static struct class net_class __ro_after_init = { +static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7279953342..6a823ba906 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -69,12 +69,15 @@ DEFINE_COOKIE(net_cookie); static struct net_generic *net_alloc_generic(void) { + unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); + unsigned int generic_size; struct net_generic *ng; - unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + generic_size = offsetof(struct net_generic, ptr[gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->s.len = max_gen_ptrs; + ng->s.len = gen_ptrs; return ng; } @@ -318,8 +321,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; - int error = 0; LIST_HEAD(net_exit_list); + LIST_HEAD(dev_kill_list); + int error = 0; refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); @@ -358,6 +362,15 @@ out_undo: synchronize_rcu(); ops = saved_ops; + rtnl_lock(); + list_for_each_entry_continue_reverse(ops, &pernet_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + + ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); @@ -573,6 +586,7 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); + LIST_HEAD(dev_kill_list); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -611,7 +625,15 @@ static void cleanup_net(struct work_struct *work) * the rcu_barrier() below isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. */ - synchronize_rcu(); + synchronize_rcu_expedited(); + + rtnl_lock(); + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) @@ -671,11 +693,16 @@ EXPORT_SYMBOL_GPL(__put_net); * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * - * Returns the net's common namespace. + * Returns the net's common namespace or ERR_PTR() if ref is zero. */ struct ns_common *get_net_ns(struct ns_common *ns) { - return &get_net(container_of(ns, struct net, ns))->ns; + struct net *net; + + net = maybe_get_net(container_of(ns, struct net, ns)); + if (net) + return &net->ns; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns); @@ -1071,7 +1098,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); - return err < 0 ? err : skb->len; + return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1186,14 +1213,25 @@ void __init net_ns_init(void) rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED); + RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); synchronize_rcu(); + + if (ops->exit_batch_rtnl) { + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); } @@ -1278,7 +1316,11 @@ static int register_pernet_operations(struct list_head *list, if (error < 0) return error; *ops->id = error; - max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); + /* This does not require READ_ONCE as writers already hold + * pernet_ops_rwsem. But WRITE_ONCE is needed to protect + * net_alloc_generic. + */ + WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1)); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/net_test.c b/net/core/net_test.c new file mode 100644 index 0000000000..9c3a590865 --- /dev/null +++ b/net/core/net_test.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> + +/* GSO */ + +#include <linux/skbuff.h> + +static const char hdr[] = "abcdefgh"; +#define GSO_TEST_SIZE 1000 + +static void __init_skb(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + memcpy(skb_mac_header(skb), hdr, sizeof(hdr)); + + /* skb_segment expects skb->data at start of payload */ + skb_pull(skb, sizeof(hdr)); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + + /* proto is arbitrary, as long as not ETH_P_TEB or vlan */ + skb->protocol = htons(ETH_P_ATALK); + skb_shinfo(skb)->gso_size = GSO_TEST_SIZE; +} + +enum gso_test_nr { + GSO_TEST_LINEAR, + GSO_TEST_NO_GSO, + GSO_TEST_FRAGS, + GSO_TEST_FRAGS_PURE, + GSO_TEST_GSO_PARTIAL, + GSO_TEST_FRAG_LIST, + GSO_TEST_FRAG_LIST_PURE, + GSO_TEST_FRAG_LIST_NON_UNIFORM, + GSO_TEST_GSO_BY_FRAGS, +}; + +struct gso_test_case { + enum gso_test_nr id; + const char *name; + + /* input */ + unsigned int linear_len; + unsigned int nr_frags; + const unsigned int *frags; + unsigned int nr_frag_skbs; + const unsigned int *frag_skbs; + + /* output as expected */ + unsigned int nr_segs; + const unsigned int *segs; +}; + +static struct gso_test_case cases[] = { + { + .id = GSO_TEST_NO_GSO, + .name = "no_gso", + .linear_len = GSO_TEST_SIZE, + .nr_segs = 1, + .segs = (const unsigned int[]) { GSO_TEST_SIZE }, + }, + { + .id = GSO_TEST_LINEAR, + .name = "linear", + .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, + }, + { + .id = GSO_TEST_FRAGS, + .name = "frags", + .linear_len = GSO_TEST_SIZE, + .nr_frags = 2, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 }, + }, + { + .id = GSO_TEST_FRAGS_PURE, + .name = "frags_pure", + .nr_frags = 3, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 }, + }, + { + .id = GSO_TEST_GSO_PARTIAL, + .name = "gso_partial", + .linear_len = GSO_TEST_SIZE, + .nr_frags = 2, + .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 }, + .nr_segs = 2, + .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 }, + }, + { + /* commit 89319d3801d1: frag_list on mss boundaries */ + .id = GSO_TEST_FRAG_LIST, + .name = "frag_list", + .linear_len = GSO_TEST_SIZE, + .nr_frag_skbs = 2, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + .nr_segs = 3, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE }, + }, + { + .id = GSO_TEST_FRAG_LIST_PURE, + .name = "frag_list_pure", + .nr_frag_skbs = 2, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + .nr_segs = 2, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE }, + }, + { + /* commit 43170c4e0ba7: GRO of frag_list trains */ + .id = GSO_TEST_FRAG_LIST_NON_UNIFORM, + .name = "frag_list_non_uniform", + .linear_len = GSO_TEST_SIZE, + .nr_frag_skbs = 4, + .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 }, + .nr_segs = 4, + .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 }, + }, + { + /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and + * commit 90017accff61 ("sctp: Add GSO support") + * + * "there will be a cover skb with protocol headers and + * children ones containing the actual segments" + */ + .id = GSO_TEST_GSO_BY_FRAGS, + .name = "gso_by_frags", + .nr_frag_skbs = 4, + .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 }, + .nr_segs = 4, + .segs = (const unsigned int[]) { 100, 200, 300, 400 }, + }, +}; + +static void gso_test_case_to_desc(struct gso_test_case *t, char *desc) +{ + sprintf(desc, "%s", t->name); +} + +KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc); + +static void gso_test_func(struct kunit *test) +{ + const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct sk_buff *skb, *segs, *cur, *next, *last; + const struct gso_test_case *tcase; + netdev_features_t features; + struct page *page; + int i; + + tcase = test->param_value; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size); + KUNIT_ASSERT_NOT_NULL(test, skb); + __skb_put(skb, sizeof(hdr) + tcase->linear_len); + + __init_skb(skb); + + if (tcase->nr_frags) { + unsigned int pg_off = 0; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + page_ref_add(page, tcase->nr_frags - 1); + + for (i = 0; i < tcase->nr_frags; i++) { + skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]); + pg_off += tcase->frags[i]; + } + + KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE); + + skb->data_len = pg_off; + skb->len += skb->data_len; + skb->truesize += skb->data_len; + } + + if (tcase->frag_skbs) { + unsigned int total_size = 0, total_true_size = 0; + struct sk_buff *frag_skb, *prev = NULL; + + for (i = 0; i < tcase->nr_frag_skbs; i++) { + unsigned int frag_size; + + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + + frag_size = tcase->frag_skbs[i]; + frag_skb = build_skb(page_address(page), + frag_size + shinfo_size); + KUNIT_ASSERT_NOT_NULL(test, frag_skb); + __skb_put(frag_skb, frag_size); + + if (prev) + prev->next = frag_skb; + else + skb_shinfo(skb)->frag_list = frag_skb; + prev = frag_skb; + + total_size += frag_size; + total_true_size += frag_skb->truesize; + } + + skb->len += total_size; + skb->data_len += total_size; + skb->truesize += total_true_size; + + if (tcase->id == GSO_TEST_GSO_BY_FRAGS) + skb_shinfo(skb)->gso_size = GSO_BY_FRAGS; + } + + features = NETIF_F_SG | NETIF_F_HW_CSUM; + if (tcase->id == GSO_TEST_GSO_PARTIAL) + features |= NETIF_F_GSO_PARTIAL; + + /* TODO: this should also work with SG, + * rather than hit BUG_ON(i >= nfrags) + */ + if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM) + features &= ~NETIF_F_SG; + + segs = skb_segment(skb, features); + if (IS_ERR(segs)) { + KUNIT_FAIL(test, "segs error %pe", segs); + goto free_gso_skb; + } else if (!segs) { + KUNIT_FAIL(test, "no segments"); + goto free_gso_skb; + } + + last = segs->prev; + for (cur = segs, i = 0; cur; cur = next, i++) { + next = cur->next; + + KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]); + + /* segs have skb->data pointing to the mac header */ + KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data); + KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr)); + + /* header was copied to all segs */ + KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0); + + /* last seg can be found through segs->prev pointer */ + if (!next) + KUNIT_ASSERT_PTR_EQ(test, cur, last); + + consume_skb(cur); + } + + KUNIT_ASSERT_EQ(test, i, tcase->nr_segs); + +free_gso_skb: + consume_skb(skb); +} + +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { + const char *name; + + const u16 *src_bits; + const u16 *exp_bits; + u8 src_num; + u8 exp_num; + + __be16 exp_val; + bool exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \ + .name = (n), \ + .src_bits = (src), \ + .src_num = ARRAY_SIZE(src), \ + .exp_comp = (comp), \ + .exp_val = (eval), \ + .exp_bits = (exp), \ + .exp_num = ARRAY_SIZE(exp), \ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { + IP_TUNNEL_KEY_BIT, + IP_TUNNEL_STRICT_BIT, + IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. + */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { + IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { + IP_TUNNEL_CONFLICT_BIT, + IP_TUNNEL_SIT_ISATAP_BIT, +}; + +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { + IP_TUNNEL_VXLAN_OPT_BIT, + 17, + 18, + 20, +}; + +static const u16 ip_tunnel_flags_3_exp[] = { + IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { + IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, + cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | + BIT(IP_TUNNEL_STRICT_BIT) | + BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), + ip_tunnel_flags_1), + IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, + VTI_ISVTI, ip_tunnel_flags_2_exp), + IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, + cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), + ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, + ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ + const struct ip_tunnel_flags_test *t = test->param_value; + IP_TUNNEL_DECLARE_FLAGS(src) = { }; + IP_TUNNEL_DECLARE_FLAGS(exp) = { }; + IP_TUNNEL_DECLARE_FLAGS(out); + + for (u32 j = 0; j < t->src_num; j++) + __set_bit(t->src_bits[j], src); + for (u32 j = 0; j < t->exp_num; j++) + __set_bit(t->exp_bits[j], exp); + + KUNIT_ASSERT_EQ(test, t->exp_comp, + ip_tunnel_flags_is_be16_compat(src)); + KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, + (__force u16)ip_tunnel_flags_to_be16(src)); + + ip_tunnel_flags_from_be16(out, t->exp_val); + KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { + KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), + KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, + ip_tunnel_flags_test_gen_params), + { }, +}; + +static struct kunit_suite net_test_suite = { + .name = "net_core", + .test_cases = net_test_cases, +}; +kunit_test_suite(net_test_suite); + +MODULE_DESCRIPTION("KUnit tests for networking core"); +MODULE_LICENSE("GPL"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index be7f2ebd61..8350a0afa9 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -68,6 +68,12 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_QSTATS_GET - dump */ +static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -138,6 +144,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_NAPI_IFINDEX, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_QSTATS_GET, + .dumpit = netdev_nl_qstats_get_dumpit, + .policy = netdev_qstats_get_nl_policy, + .maxattr = NETDEV_A_QSTATS_SCOPE, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index a47f2bcbe4..4db40fd5b4 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -28,6 +28,8 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 918b109e0c..05f9515d2c 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -8,6 +8,7 @@ #include <net/xdp.h> #include <net/xdp_sock.h> #include <net/netdev_rx_queue.h> +#include <net/netdev_queues.h> #include <net/busy_poll.h> #include "netdev-genl-gen.h" @@ -58,22 +59,22 @@ XDP_METADATA_KFUNC_xxx nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, - xsk_features, NETDEV_A_DEV_PAD)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + xsk_features, NETDEV_A_DEV_PAD)) + goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, - netdev->xdp_zc_max_segs)) { - genlmsg_cancel(rsp, hdr); - return -EINVAL; - } + netdev->xdp_zc_max_segs)) + goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; } static void @@ -460,6 +461,266 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +#define NETDEV_STAT_NOT_SET (~0ULL) + +static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) +{ + const u64 *add = _add; + u64 *sum = _sum; + + while (size) { + if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) + *sum += *add; + sum++; + add++; + size -= 8; + } +} + +static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) +{ + if (value == NETDEV_STAT_NOT_SET) + return 0; + return nla_put_uint(rsp, attr_id, value); +} + +static int +netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) +{ + if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) + return -EMSGSIZE; + return 0; +} + +static int +netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, + u32 q_type, int i, const struct genl_info *info) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + memset(&rx, 0xff, sizeof(rx)); + ops->get_queue_stats_rx(netdev, i, &rx); + if (!memchr_inv(&rx, 0xff, sizeof(rx))) + goto nla_cancel; + if (netdev_nl_stats_write_rx(rsp, &rx)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + memset(&tx, 0xff, sizeof(tx)); + ops->get_queue_stats_tx(netdev, i, &tx); + if (!memchr_inv(&tx, 0xff, sizeof(tx))) + goto nla_cancel; + if (netdev_nl_stats_write_tx(rsp, &tx)) + goto nla_put_failure; + break; + } + + genlmsg_end(rsp, hdr); + return 0; + +nla_cancel: + genlmsg_cancel(rsp, hdr); + return 0; +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + const struct netdev_stat_ops *ops = netdev->stat_ops; + int i, err; + + if (!(netdev->flags & IFF_UP)) + return 0; + + i = ctx->rxq_idx; + while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, + i, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + i = ctx->txq_idx; + while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { + err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, + i, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + return 0; +} + +static int +netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info) +{ + struct netdev_queue_stats_rx rx_sum, rx; + struct netdev_queue_stats_tx tx_sum, tx; + const struct netdev_stat_ops *ops; + void *hdr; + int i; + + ops = netdev->stat_ops; + /* Netdev can't guarantee any complete counters */ + if (!ops->get_base_stats) + return 0; + + memset(&rx_sum, 0xff, sizeof(rx_sum)); + memset(&tx_sum, 0xff, sizeof(tx_sum)); + + ops->get_base_stats(netdev, &rx_sum, &tx_sum); + + /* The op was there, but nothing reported, don't bother */ + if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && + !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + for (i = 0; i < netdev->real_num_rx_queues; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); + } + for (i = 0; i < netdev->real_num_tx_queues; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); + } + + if (netdev_nl_stats_write_rx(rsp, &rx_sum) || + netdev_nl_stats_write_tx(rsp, &tx_sum)) + goto nla_put_failure; + + genlmsg_end(rsp, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, + struct sk_buff *skb, const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + if (!netdev->stat_ops) + return 0; + + switch (scope) { + case 0: + return netdev_nl_stats_by_netdev(netdev, skb, info); + case NETDEV_QSTATS_SCOPE_QUEUE: + return netdev_nl_stats_by_queue(netdev, skb, info, ctx); + } + + return -EINVAL; /* Should not happen, per netlink policy */ +} + +int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + unsigned int ifindex; + unsigned int scope; + int err = 0; + + scope = 0; + if (info->attrs[NETDEV_A_QSTATS_SCOPE]) + scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); + + ifindex = 0; + if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev && netdev->stat_ops) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = netdev ? -EOPNOTSUPP : -ENODEV; + } + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; + } + } + rtnl_unlock(); + + return err; +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159..55bcacf67d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -316,7 +316,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4933762e5a..3772eb63dc 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,6 +5,7 @@ * Copyright (C) 2016 Red Hat, Inc. */ +#include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -31,6 +32,8 @@ #define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS +static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); + /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ @@ -121,9 +124,9 @@ int page_pool_ethtool_stats_get_count(void) } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { - struct page_pool_stats *pool_stats = stats; + const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; @@ -170,16 +173,29 @@ static void page_pool_producer_unlock(struct page_pool *pool, spin_unlock_bh(&pool->ring.producer_lock); } +static void page_pool_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); + CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long)); +} + static int page_pool_init(struct page_pool *pool, - const struct page_pool_params *params) + const struct page_pool_params *params, + int cpuid) { unsigned int ring_qsize = 1024; /* Default */ + page_pool_struct_check(); + memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); + pool->cpuid = cpuid; + /* Validate only known flags were used */ - if (pool->p.flags & ~(PP_FLAG_ALL)) + if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) @@ -193,22 +209,26 @@ static int page_pool_init(struct page_pool *pool, * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ - if (pool->p.flags & PP_FLAG_DMA_MAP) { + if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + + pool->dma_map = true; } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; + pool->dma_sync = true; + /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ @@ -217,14 +237,24 @@ static int page_pool_init(struct page_pool *pool, pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS - pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); - if (!pool->recycle_stats) - return -ENOMEM; + if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; + } else { + /* For system page pool instance we use a singular stats object + * instead of allocating a separate percpu variable for each + * (also percpu) page pool instance. + */ + pool->recycle_stats = &pp_system_recycle_stats; + pool->system = true; + } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); + if (!pool->system) + free_percpu(pool->recycle_stats); #endif return -ENOMEM; } @@ -234,7 +264,7 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->p.flags & PP_FLAG_DMA_MAP) + if (pool->dma_map) get_device(pool->p.dev); return 0; @@ -244,19 +274,22 @@ static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - if (pool->p.flags & PP_FLAG_DMA_MAP) + if (pool->dma_map) put_device(pool->p.dev); #ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); + if (!pool->system) + free_percpu(pool->recycle_stats); #endif } /** - * page_pool_create() - create a page pool. + * page_pool_create_percpu() - create a page pool for a given cpu. * @params: parameters, see struct page_pool_params + * @cpuid: cpu identifier */ -struct page_pool *page_pool_create(const struct page_pool_params *params) +struct page_pool * +page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; @@ -265,7 +298,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) if (!pool) return ERR_PTR(-ENOMEM); - err = page_pool_init(pool, params); + err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; @@ -282,6 +315,16 @@ err_free: kfree(pool); return ERR_PTR(err); } +EXPORT_SYMBOL(page_pool_create_percpu); + +/** + * page_pool_create() - create a page pool + * @params: parameters, see struct page_pool_params + */ +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + return page_pool_create_percpu(params, -1); +} EXPORT_SYMBOL(page_pool_create); static void page_pool_return_page(struct page_pool *pool, struct page *page); @@ -356,16 +399,26 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } -static void page_pool_dma_sync_for_device(struct page_pool *pool, - struct page *page, - unsigned int dma_sync_size) +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, + const struct page *page, + u32 dma_sync_size) { +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) dma_addr_t dma_addr = page_pool_get_dma_addr(page); dma_sync_size = min(dma_sync_size, pool->p.max_len); - dma_sync_single_range_for_device(pool->p.dev, dma_addr, - pool->p.offset, dma_sync_size, - pool->p.dma_dir); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + dma_sync_size, pool->p.dma_dir); +#endif +} + +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, + const struct page *page, + u32 dma_sync_size) +{ + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) + __page_pool_dma_sync_for_device(pool, page, dma_sync_size); } static bool page_pool_dma_map(struct page_pool *pool, struct page *page) @@ -387,13 +440,12 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) if (page_pool_set_dma_addr(page, dma)) goto unmap_failed; - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); return true; unmap_failed: - WARN_ON_ONCE("unexpected DMA address, please report to netdev@"); + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -433,8 +485,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if ((pool->p.flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page))) { put_page(page); return NULL; } @@ -454,8 +505,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; - unsigned int pp_flags = pool->p.flags; unsigned int pp_order = pool->p.order; + bool dma_map = pool->dma_map; struct page *page; int i, nr_pages; @@ -480,8 +531,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { page = pool->alloc.cache[i]; - if ((pp_flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, page))) { put_page(page); continue; } @@ -523,6 +573,7 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) return page; } EXPORT_SYMBOL(page_pool_alloc_pages); +ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution @@ -553,7 +604,7 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) { dma_addr_t dma; - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ @@ -630,8 +681,13 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; } +static bool __page_pool_page_can_be_recycled(const struct page *page) +{ + return page_ref_count(page) == 1 && !page_is_pfmemalloc(page); +} + /* If the page refcnt == 1, this will try to recycle the page. - * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. @@ -651,15 +707,12 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { + if (likely(__page_pool_page_can_be_recycled(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, - dma_sync_size); + page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) + if (allow_direct && page_pool_recycle_in_cache(page, pool)) return NULL; /* Page found as candidate for recycling */ @@ -684,9 +737,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ @@ -715,8 +794,11 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { int i, bulk_len = 0; + bool allow_direct; bool in_softirq; + allow_direct = page_pool_napi_local(pool); + for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); @@ -724,13 +806,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, if (!page_pool_is_last_ref(page)) continue; - page = __page_pool_put_page(pool, page, -1, false); + page = __page_pool_put_page(pool, page, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (page) data[bulk_len++] = page; } - if (unlikely(!bulk_len)) + if (!bulk_len) return; /* Bulk producer into ptr_ring page_pool cache */ @@ -766,10 +848,8 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, if (likely(page_pool_unref_page(page, drain_count))) return NULL; - if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, -1); - + if (__page_pool_page_can_be_recycled(page)) { + page_pool_dma_sync_for_device(pool, page, -1); return page; } @@ -927,15 +1007,20 @@ static void page_pool_release_retry(struct work_struct *wq) } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } -void page_pool_unlink_napi(struct page_pool *pool) +static void page_pool_disable_direct_recycling(struct page_pool *pool) { + /* Disable direct recycling based on pool->cpuid. + * Paired with READ_ONCE() in page_pool_napi_local(). + */ + WRITE_ONCE(pool->cpuid, -1); + if (!pool->p.napi) return; @@ -947,7 +1032,6 @@ void page_pool_unlink_napi(struct page_pool *pool) WRITE_ONCE(pool->p.napi, NULL); } -EXPORT_SYMBOL(page_pool_unlink_napi); void page_pool_destroy(struct page_pool *pool) { @@ -957,7 +1041,7 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; - page_pool_unlink_napi(pool); + page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bd50e9fe32..5e589f0a62 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -483,24 +483,15 @@ EXPORT_SYMBOL_GPL(__rtnl_link_unregister); */ static void rtnl_lock_unregistering_all(void) { - struct net *net; - bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ - for_each_net(net) { - if (atomic_read(&net->dev_unreg_count) > 0) { - unregistering = true; - break; - } - } - if (!unregistering) + if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); @@ -851,9 +842,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netdev_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netdev_state_change(dev); +} +EXPORT_SYMBOL(netdev_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: @@ -875,12 +879,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (dev->operstate != operstate) { - write_lock(&dev_base_lock); - dev->operstate = operstate; - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } + netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -1037,8 +1036,8 @@ static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); - if (dev->proto_down_reason) - size += nla_total_size(0) + nla_total_size(4); + /* Assume dev->proto_down_reason is not zero. */ + size += nla_total_size(0) + nla_total_size(4); return size; } @@ -1456,17 +1455,18 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return 0; } -static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +static int rtnl_fill_link_ifmap(struct sk_buff *skb, + const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); - map.mem_start = dev->mem_start; - map.mem_end = dev->mem_end; - map.base_addr = dev->base_addr; - map.irq = dev->irq; - map.dma = dev->dma; - map.port = dev->if_port; + map.mem_start = READ_ONCE(dev->mem_start); + map.mem_end = READ_ONCE(dev->mem_end); + map.base_addr = READ_ONCE(dev->base_addr); + map.irq = READ_ONCE(dev->irq); + map.dma = READ_ONCE(dev->dma); + map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; @@ -1477,13 +1477,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; + u32 res = 0; - ASSERT_RTNL(); + rcu_read_lock(); + generic_xdp_prog = rcu_dereference(dev->xdp_prog); + if (generic_xdp_prog) + res = generic_xdp_prog->aux->id; + rcu_read_unlock(); - generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (!generic_xdp_prog) - return 0; - return generic_xdp_prog->aux->id; + return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1603,7 +1605,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + ret = nla_put_u32(skb, IFLA_MASTER, + READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; @@ -1612,10 +1615,10 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { - int ifindex = dev_get_iflink(dev); + int iflink = dev_get_iflink(dev); - if (force || dev->ifindex != ifindex) - return nla_put_u32(skb, IFLA_LINK, ifindex); + if (force || READ_ONCE(dev->ifindex) != iflink) + return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } @@ -1699,7 +1702,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb, struct netdev_name_node *name_node; int count = 0; - list_for_each_entry(name_node, &dev->name_node->list, list) { + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; @@ -1707,6 +1710,7 @@ static int rtnl_fill_alt_ifnames(struct sk_buff *skb, return count; } +/* RCU protected. */ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { @@ -1735,10 +1739,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb, struct nlattr *pr; u32 preason; - if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; - preason = dev->proto_down_reason; + preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; @@ -1811,6 +1815,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { + char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; @@ -1823,41 +1828,51 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; + ifm->ifi_type = READ_ONCE(dev->type); + ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; - qdisc = rtnl_dereference(dev->qdisc); - if (nla_put_string(skb, IFLA_IFNAME, dev->name) || - nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + netdev_copy_name(dev, devname); + if (nla_put_string(skb, IFLA_IFNAME, devname)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || - nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || - nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || - nla_put_u32(skb, IFLA_GROUP, dev->group) || - nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || - nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || - nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || - nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || - nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || - nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || - nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || - nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || + netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || + nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || + nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || + nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || + nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, + READ_ONCE(dev->num_tx_queues)) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, + READ_ONCE(dev->gso_max_segs)) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, + READ_ONCE(dev->gso_max_size)) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, + READ_ONCE(dev->gro_max_size)) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, + READ_ONCE(dev->gso_ipv4_max_size)) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, + READ_ONCE(dev->gro_ipv4_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, + READ_ONCE(dev->tso_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, + READ_ONCE(dev->tso_max_segs)) || #ifdef CONFIG_RPS - nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, + READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (qdisc && - nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -1876,9 +1891,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_ifmap(skb, dev)) - goto nla_put_failure; - if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) @@ -1911,9 +1923,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) - goto nla_put_failure; - if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; @@ -1926,12 +1935,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); + if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) + goto nla_put_failure_rcu; + qdisc = rcu_dereference(dev->qdisc); + if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) + goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; - rcu_read_unlock(); - + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure_rcu; if (rtnl_fill_prop_list(skb, dev)) - goto nla_put_failure; + goto nla_put_failure_rcu; + rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, @@ -2200,25 +2215,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; - struct hlist_head *head; + unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; + struct net *tgt_net = net; u32 ext_filter_mask = 0; - const struct rtnl_link_ops *kind_ops = NULL; - unsigned int flags = NLM_F_MULTI; + struct net_device *dev; int master_idx = 0; int netnsid = -1; int err, i; - s_h = cb->args[0]; - s_idx = cb->args[1]; - err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) @@ -2262,36 +2274,18 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) flags |= NLM_F_DUMP_FILTERED; walk_entries: - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (link_dump_filtered(dev, master_idx, kind_ops)) - goto cont; - if (idx < s_idx) - goto cont; - err = rtnl_fill_ifinfo(skb, dev, net, - RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, 0, - netnsid, GFP_KERNEL); - - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } -cont: - idx++; - } + err = 0; + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, 0, flags, + ext_filter_mask, 0, NULL, 0, + netnsid, GFP_KERNEL); + if (err < 0) + break; } -out: - err = skb->len; -out_err: - cb->args[1] = idx; - cb->args[0] = h; cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -2552,7 +2546,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || - nla_len(attr) < NLA_HDRLEN) { + nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) @@ -2983,11 +2977,9 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; - dev->link_mode = value; - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { @@ -3296,7 +3288,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(tgt_net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -5269,15 +5261,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - if (nla_len(attr) < sizeof(flags)) - return -EINVAL; + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; - have_flags = true; - flags = nla_get_u16(attr); - break; - } + have_flags = true; + flags = nla_get_u16(attr); + break; } } @@ -5986,19 +5977,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; - int h, s_h, err, s_idx, s_idxattr, s_prividx; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; - struct hlist_head *head; + struct { + unsigned long ifindex; + int idxattr; + int prividx; + } *ctx = (void *)cb->ctx; struct net_device *dev; - int idx = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - s_idxattr = cb->args[2]; - s_prividx = cb->args[3]; + int err; cb->seq = net->dev_base_seq; @@ -6017,39 +6006,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, &filters, - &s_idxattr, &s_prividx, - extack); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + for_each_netdev_dump(net, dev, ctx->ifindex) { + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &ctx->idxattr, &ctx->prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble. + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; - s_prividx = 0; - s_idxattr = 0; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } + if (err < 0) + break; + ctx->prividx = 0; + ctx->idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } -out: - cb->args[3] = s_prividx; - cb->args[2] = s_idxattr; - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void rtnl_offload_xstats_notify(struct net_device *dev) @@ -6508,6 +6484,46 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Process one rtnetlink message. */ +static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + rtnl_dumpit_func dumpit = cb->data; + int err; + + /* Previous iteration have already finished, avoid calling->dumpit() + * again, it may not expect to be called after it reached the end. + */ + if (!dumpit) + return 0; + + err = dumpit(skb, cb); + + /* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). + * Some applications which parse netlink manually depend on this. + */ + if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + if (err < 0 && err != -EMSGSIZE) + return err; + if (!err) + cb->data = NULL; + + return skb->len; + } + return err; +} + +static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) +{ + if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { + WARN_ON(control->data); + control->data = control->dump; + control->dump = rtnl_dumpit; + } + + return netlink_dump_start(ssk, skb, nlh, control); +} + static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -6552,6 +6568,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } owner = link->owner; dumpit = link->dumpit; + flags = link->flags; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); @@ -6569,8 +6586,9 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, + .flags = flags, }; - err = netlink_dump_start(rtnl, skb, nlh, &c); + err = rtnetlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ @@ -6716,7 +6734,7 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, 0); + rtnl_dump_ifinfo, RTNL_FLAG_DUMP_SPLIT_NLM_DONE); rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); diff --git a/net/core/scm.c b/net/core/scm.c index d0e0852a24..4f6a14babe 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } @@ -371,8 +382,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 71dee435d5..466999a751 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> @@ -69,6 +70,7 @@ #include <net/sock.h> #include <net/checksum.h> #include <net/gso.h> +#include <net/hotdata.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/mpls.h> @@ -88,15 +90,10 @@ #include "dev.h" #include "sock_destructor.h" -struct kmem_cache *skbuff_cache __ro_after_init; -static struct kmem_cache *skbuff_fclone_cache __ro_after_init; #ifdef CONFIG_SKB_EXTENSIONS static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif - -static struct kmem_cache *skb_small_head_cache __ro_after_init; - #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. @@ -112,8 +109,23 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); +/* kcm_write_msgs() relies on casting paged frags to bio_vec to use + * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the + * netmem is a page. + */ +static_assert(offsetof(struct bio_vec, bv_page) == + offsetof(skb_frag_t, netmem)); +static_assert(sizeof_field(struct bio_vec, bv_page) == + sizeof_field(skb_frag_t, netmem)); + +static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); +static_assert(sizeof_field(struct bio_vec, bv_len) == + sizeof_field(skb_frag_t, len)); + +static_assert(offsetof(struct bio_vec, bv_offset) == + offsetof(skb_frag_t, offset)); +static_assert(sizeof_field(struct bio_vec, bv_offset) == + sizeof_field(skb_frag_t, offset)); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, @@ -297,7 +309,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) fragsz = SKB_DATA_ALIGN(fragsz); - return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + align_mask); } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -309,13 +322,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) if (in_hardirq() || irqs_disabled()) { struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); - data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); + data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, + align_mask); } else { struct napi_alloc_cache *nc; local_bh_disable(); nc = this_cpu_ptr(&napi_alloc_cache); - data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + align_mask); local_bh_enable(); } return data; @@ -328,7 +343,7 @@ static struct sk_buff *napi_skb_cache_get(void) struct sk_buff *skb; if (unlikely(!nc->skb_count)) { - nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache, + nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); @@ -337,7 +352,7 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; - kasan_mempool_unpoison_object(skb, kmem_cache_size(skbuff_cache)); + kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; } @@ -395,7 +410,7 @@ struct sk_buff *slab_build_skb(void *data) struct sk_buff *skb; unsigned int size; - skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -446,7 +461,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -557,7 +572,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj_size = SKB_HEAD_ALIGN(*size); if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && !(flags & KMALLOC_NOT_NORMAL_BITS)) { - obj = kmem_cache_alloc_node(skb_small_head_cache, + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; @@ -565,7 +580,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, goto out; /* Try again but now we are using pfmemalloc reserves */ ret_pfmemalloc = true; - obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node); + obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); goto out; } @@ -628,7 +643,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, u8 *data; cache = (flags & SKB_ALLOC_FCLONE) - ? skbuff_fclone_cache : skbuff_cache; + ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; @@ -758,10 +773,9 @@ skb_fail: EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -770,9 +784,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -843,19 +857,19 @@ skb_success: skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); -void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize) +void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size, unsigned int truesize) { DEBUG_NET_WARN_ON_ONCE(size > truesize); - skb_fill_page_desc(skb, i, page, off, size); + skb_fill_netmem_desc(skb, i, netmem, off, size); skb->len += size; skb->data_len += size; skb->truesize += truesize; } -EXPORT_SYMBOL(skb_add_rx_frag); +EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) @@ -895,12 +909,101 @@ static bool is_pp_page(struct page *page) return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; } +int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom) +{ #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. + */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + data = page_pool_dev_alloc_va(pool, &truesize); + if (!data) + return -ENOMEM; + + nskb = napi_build_skb(data, truesize); + if (!nskb) { + page_pool_free_va(pool, data, true); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + skb_mark_for_recycle(nskb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(pool, &page_off, &truesize); + if (!page) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} +EXPORT_SYMBOL(skb_pp_cow_data); + +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + struct bpf_prog *prog) { - bool allow_direct = false; - struct page_pool *pp; + if (!prog->aux->xdp_has_frags) + return -EINVAL; + + return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page) +{ page = compound_head(page); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation @@ -913,38 +1016,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) if (unlikely(!is_pp_page(page))) return false; - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - * __page_pool_put_page() makes sure we're not in hardirq context - * and interrupts are enabled prior to accessing the cache. - */ - if (napi_safe || in_softirq()) { - const struct napi_struct *napi = READ_ONCE(pp->p.napi); - - allow_direct = napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - } - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); + page_pool_put_full_page(page->pp, page, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return napi_pp_put_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data)); } /** @@ -981,17 +1064,17 @@ static int skb_pp_frag_ref(struct sk_buff *skb) static void skb_kfree_head(void *head, unsigned int end_offset) { if (end_offset == SKB_SMALL_HEAD_HEADROOM) - kmem_cache_free(skb_small_head_cache, head); + kmem_cache_free(net_hotdata.skb_small_head_cache, head); else kfree(head); } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head, napi_safe)) + if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { @@ -999,15 +1082,12 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; - if (skb->cloned && - atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &shinfo->dataref)) + if (!skb_data_unref(skb, shinfo)) goto exit; if (skb_zcopy(skb)) { @@ -1019,13 +1099,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb, napi_safe); + skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -1048,7 +1128,7 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: - kmem_cache_free(skbuff_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); return; case SKB_FCLONE_ORIG: @@ -1069,7 +1149,7 @@ static void kfree_skbmem(struct sk_buff *skb) if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: - kmem_cache_free(skbuff_fclone_cache, fclones); + kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); } void skb_release_head_state(struct sk_buff *skb) @@ -1086,12 +1166,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason, napi_safe); + skb_release_data(skb, reason); } /** @@ -1105,7 +1184,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1162,11 +1241,11 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason, false); + skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { - kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE, + kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, sa->skb_array); sa->skb_count = 0; } @@ -1191,7 +1270,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) } if (sa.skb_count) - kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array); + kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); } EXPORT_SYMBOL(kfree_skb_list_reason); @@ -1223,22 +1302,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" - "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" - "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" - "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" + "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" + "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, + skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, - skb->csum, skb->ip_summed, skb->csum_complete_sw, - skb->csum_valid, skb->csum_level, + skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, - ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, + skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, + skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, + skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", @@ -1336,7 +1421,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1353,9 +1438,9 @@ static void napi_skb_cache_put(struct sk_buff *skb) if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], - kmem_cache_size(skbuff_cache)); + kmem_cache_size(net_hotdata.skbuff_cache)); - kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF, + kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } @@ -1363,7 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, reason, true); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1401,7 +1486,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED, !!budget); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1532,7 +1617,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED, false); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -1600,7 +1685,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->ubuf.callback = msg_zerocopy_callback; + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; @@ -1626,7 +1711,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ - if (uarg->callback != msg_zerocopy_callback) + if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), @@ -1737,8 +1822,8 @@ release: sock_put(sk); } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1747,7 +1832,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } -EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { @@ -1757,10 +1841,15 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg_to_msgzc(uarg)->len--; if (have_uref) - msg_zerocopy_callback(NULL, uarg, true); + msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { + .complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1768,11 +1857,18 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct ubuf_info *orig_uarg = skb_zcopy(skb); int err, orig_len = skb->len; - /* An skb can only point to one uarg. This edge case happens when - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. - */ - if (orig_uarg && uarg != orig_uarg) - return -EEXIST; + if (uarg->ops->link_skb) { + err = uarg->ops->link_skb(skb, uarg); + if (err) + return err; + } else { + /* An skb can only point to one uarg. This edge case happens + * when TCP appends to an skb, but zerocopy_realloc triggered + * a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + } err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { @@ -1786,7 +1882,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg, NULL); + if (!uarg->ops->link_skb) + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1906,10 +2003,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { - __skb_fill_page_desc(skb, i, head, 0, psize); + __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); head = (struct page *)page_private(head); } - __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, + d_off); skb_shinfo(skb)->nr_frags = new_frags; release: @@ -1951,7 +2049,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - n = kmem_cache_alloc(skbuff_cache, gfp_mask); + n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); if (!n) return NULL; @@ -2014,11 +2112,17 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb_headroom(skb); - unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + struct sk_buff *n; + unsigned int size; + int headerlen; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + headerlen = skb_headroom(skb); + size = skb_end_offset(skb) + skb->data_len; + n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -2163,9 +2267,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { - skb_free_head(skb, false); + skb_free_head(skb); } off = (data + nhead) - skb->head; @@ -2346,12 +2450,17 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask, skb_alloc_rx_flag(skb), - NUMA_NO_NODE); - int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; + struct sk_buff *n; + int oldheadroom; + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + + oldheadroom = skb_headroom(skb); + n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; @@ -3647,7 +3756,8 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) if (plen) { page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); + __skb_fill_netmem_desc(to, 0, page_to_netmem(page), + offset, plen); get_page(page); j = 1; len -= plen; @@ -4889,7 +4999,7 @@ static void skb_extensions_init(void) {} void __init skb_init(void) { - skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", + net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC| @@ -4897,7 +5007,7 @@ void __init skb_init(void) offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); - skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, @@ -4906,7 +5016,7 @@ void __init skb_init(void) * struct skb_shared_info is located at the end of skb->head, * and should not be copied to/from user. */ - skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", + net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", SKB_SMALL_HEAD_CACHE_SIZE, 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, @@ -5779,7 +5889,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) { if (head_stolen) { skb_release_head_state(skb); - kmem_cache_free(skbuff_cache, skb); + kmem_cache_free(net_hotdata.skbuff_cache, skb); } else { __kfree_skb(skb); } @@ -6465,12 +6575,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb, false); + skb_free_head(skb); } skb->head = data; @@ -6605,7 +6715,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; @@ -6885,6 +6995,19 @@ free_now: EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + local_bh_disable(); + __napi_kfree_skb(skb, SKB_CONSUMED); + local_bh_enable(); +} + /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer @@ -6900,10 +7023,10 @@ void skb_attempt_defer_free(struct sk_buff *skb) unsigned int defer_max; bool kick; - if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || - !cpu_online(cpu) || - cpu == raw_smp_processor_id()) { -nodefer: __kfree_skb(skb); + if (cpu == raw_smp_processor_id() || + WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu)) { +nodefer: kfree_skb_napi_cache(skb); return; } @@ -6911,7 +7034,7 @@ nodefer: __kfree_skb(skb); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); - defer_max = READ_ONCE(sysctl_skb_defer_max); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; @@ -6929,8 +7052,8 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) - smp_call_function_single_async(cpu, &sd->defer_csd); + if (unlikely(kick)) + kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, @@ -6963,7 +7086,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { - size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4d75ef9d24..bbf40b9997 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,7 +434,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? copied : -EFAULT; goto out; @@ -1226,11 +1227,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - if (psock) { - read_lock_bh(&sk->sk_callback_lock); + if (psock) sk_psock_data_ready(sk, psock); - read_unlock_bh(&sk->sk_callback_lock); - } rcu_read_unlock(); } } diff --git a/net/core/sock.c b/net/core/sock.c index 9cf404e803..100e975073 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -127,6 +127,7 @@ #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -283,7 +284,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; int sysctl_tstamp_allow_data __read_mostly = 1; @@ -482,7 +482,7 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; @@ -552,7 +552,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -2053,8 +2053,9 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); - memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + prot->obj_size - offsetof(struct sock, sk_dontcopy_end), + /* alloc is larger than struct, see sk_prot_alloc() */); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -2525,13 +2526,12 @@ EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { -#ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ - if (skb->decrypted) + if (skb_is_decrypted(skb)) return false; -#endif + return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } @@ -2583,8 +2583,18 @@ EXPORT_SYMBOL(sock_efree); #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { - if (sk_is_refcounted(skb->sk)) - sock_gen_put(skb->sk); + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) + return; + + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + inet_reqsk(sk)->rsk_listener = NULL; + reqsk_free(inet_reqsk(sk)); + return; + } + + sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ @@ -3231,8 +3241,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { return -EOPNOTSUPP; } @@ -3327,7 +3337,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3342,7 +3352,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3362,7 +3372,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3387,7 +3397,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } @@ -3732,6 +3742,9 @@ void sk_common_release(struct sock *sk) sk->sk_prot->unhash(sk); + if (sk->sk_socket) + sk->sk_socket->sk = NULL; + /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and @@ -4224,3 +4237,65 @@ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) return sock_ioctl_out(sk, cmd, arg); } EXPORT_SYMBOL(sk_ioctl); + +static int __init sock_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); + return 0; +} + +core_initcall(sock_struct_check); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index c53b731f2d..6541228380 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -16,9 +16,10 @@ #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; -static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); -static DEFINE_MUTEX(sock_diag_table_mutex); +static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; + +static struct sock_diag_inet_compat __rcu *inet_rcv_compat; + static struct workqueue_struct *broadcast_wq; DEFINE_COOKIE(sock_cookie); @@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void) + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } +static const struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + const struct sock_diag_handler *handler; + + rcu_read_lock(); + handler = rcu_dereference(sock_diag_handlers[family]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); + + return handler; +} + +static void sock_diag_unlock_handler(const struct sock_diag_handler *handler) +{ + module_put(handler->owner); +} + static void sock_diag_broadcast_destroy_work(struct work_struct *work) { struct broadcast_sk *bsk = @@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work) if (!skb) goto out; - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[sk->sk_family]; - if (hndl && hndl->get_info) - err = hndl->get_info(skb, sk); - mutex_unlock(&sock_diag_table_mutex); - + hndl = sock_diag_lock_handler(sk->sk_family); + if (hndl) { + if (hndl->get_info) + err = hndl->get_info(skb, sk); + sock_diag_unlock_handler(hndl); + } if (!err) nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, GFP_KERNEL); @@ -166,51 +185,45 @@ void sock_diag_broadcast_destroy(struct sock *sk) queue_work(broadcast_wq, &bsk->work); } -void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = fn; - mutex_unlock(&sock_diag_table_mutex); + xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, + ptr); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); -void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = NULL; - mutex_unlock(&sock_diag_table_mutex); + const struct sock_diag_inet_compat *old; + + old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, + NULL); + WARN_ON_ONCE(old != ptr); } EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); int sock_diag_register(const struct sock_diag_handler *hndl) { - int err = 0; + int family = hndl->family; - if (hndl->family >= AF_MAX) + if (family >= AF_MAX) return -EINVAL; - mutex_lock(&sock_diag_table_mutex); - if (sock_diag_handlers[hndl->family]) - err = -EBUSY; - else - WRITE_ONCE(sock_diag_handlers[hndl->family], hndl); - mutex_unlock(&sock_diag_table_mutex); - - return err; + return !cmpxchg((const struct sock_diag_handler **) + &sock_diag_handlers[family], + NULL, hndl) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(sock_diag_register); -void sock_diag_unregister(const struct sock_diag_handler *hnld) +void sock_diag_unregister(const struct sock_diag_handler *hndl) { - int family = hnld->family; + int family = hndl->family; if (family >= AF_MAX) return; - mutex_lock(&sock_diag_table_mutex); - BUG_ON(sock_diag_handlers[family] != hnld); - WRITE_ONCE(sock_diag_handlers[family], NULL); - mutex_unlock(&sock_diag_table_mutex); + xchg((const struct sock_diag_handler **)&sock_diag_handlers[family], + NULL); } EXPORT_SYMBOL_GPL(sock_diag_unregister); @@ -227,20 +240,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); - if (READ_ONCE(sock_diag_handlers[req->sdiag_family]) == NULL) + if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family])) sock_load_diag_module(req->sdiag_family, 0); - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[req->sdiag_family]; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) - err = -ENOENT; - else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) + return -ENOENT; + + if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) err = hndl->destroy(skb, nlh); else err = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + sock_diag_unlock_handler(hndl); return err; } @@ -248,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + const struct sock_diag_inet_compat *ptr; int ret; switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: - if (inet_rcv_compat == NULL) + + if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); - mutex_lock(&sock_diag_table_mutex); - if (inet_rcv_compat != NULL) - ret = inet_rcv_compat(skb, nlh); - else - ret = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + rcu_read_lock(); + ptr = rcu_dereference(inet_rcv_compat); + if (ptr && !try_module_get(ptr->owner)) + ptr = NULL; + rcu_read_unlock(); + + ret = -EOPNOTSUPP; + if (ptr) { + ret = ptr->fn(skb, nlh); + module_put(ptr->owner); + } return ret; case SOCK_DIAG_BY_FAMILY: @@ -272,13 +292,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } } -static DEFINE_MUTEX(sock_diag_mutex); - static void sock_diag_rcv(struct sk_buff *skb) { - mutex_lock(&sock_diag_mutex); netlink_rcv_skb(skb, &sock_diag_rcv_msg); - mutex_unlock(&sock_diag_mutex); } static int sock_diag_bind(struct net *net, int group) @@ -286,12 +302,12 @@ static int sock_diag_bind(struct net *net, int group) switch (group) { case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: - if (!READ_ONCE(sock_diag_handlers[AF_INET])) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET])) sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: - if (!READ_ONCE(sock_diag_handlers[AF_INET6])) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET6])) sock_load_diag_module(AF_INET6, 0); break; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8598466a38..d3dbb92153 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -411,9 +423,6 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; - if (irqs_disabled()) - return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ - spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -936,9 +945,6 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; - if (irqs_disabled()) - return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ - hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); @@ -1460,55 +1466,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1533,7 +1568,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1639,19 +1674,23 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock_get(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - saved_close = READ_ONCE(sk->sk_prot)->close; - } else { + psock = sk_psock(sk); + if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); + psock = sk_psock_get(sk); + if (unlikely(!psock)) + goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + } else { + saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: + rcu_read_unlock(); + release_sock(sk); } /* Make sure we do not recurse. This is a bug. @@ -1663,6 +1702,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. + */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 986f15e5d6..c9fb9ad874 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,9 @@ #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> +#include <net/rps.h> #include "dev.h" @@ -138,7 +141,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -159,7 +163,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -170,7 +175,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); @@ -300,8 +306,8 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); @@ -410,7 +416,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "mem_pcpu_rsv", - .data = &sysctl_mem_pcpu_rsv, + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -439,7 +445,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -498,7 +504,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -576,7 +582,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -590,7 +596,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -599,7 +605,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -632,7 +638,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -649,13 +655,12 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { @@ -692,7 +697,6 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -710,20 +714,21 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) + tbl[i].data += (char *)net - (char *)&init_net; } - net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, - ARRAY_SIZE(netns_core_table)); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -738,7 +743,7 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); diff --git a/net/core/xdp.c b/net/core/xdp.c index 4869c1c2d8..bcc5551c64 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -16,6 +16,7 @@ #include <linux/bug.h> #include <net/page_pool/helpers.h> +#include <net/hotdata.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> @@ -75,7 +76,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); /* Allow this ID to be reused */ - ida_simple_remove(&mem_id_pool, xa->mem.id); + ida_free(&mem_id_pool, xa->mem.id); kfree(xa); } @@ -126,10 +127,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem) return; if (type == MEM_TYPE_PAGE_POOL) { - rcu_read_lock(); - xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); page_pool_destroy(xa->page_pool); - rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); @@ -242,7 +241,7 @@ static int __mem_id_cyclic_get(gfp_t gfp) int id; again: - id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); if (id < 0) { if (id == -ENOSPC) { /* Cyclic allocator, reset next id */ @@ -294,10 +293,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, mutex_lock(&mem_id_lock); ret = __mem_id_init_hash_table(); mutex_unlock(&mem_id_lock); - if (ret < 0) { - WARN_ON(1); + if (ret < 0) return ERR_PTR(ret); - } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); @@ -317,7 +314,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, mem->id); + ida_free(&mem_id_pool, mem->id); mem->id = 0; errno = PTR_ERR(ptr); goto err; @@ -589,7 +586,7 @@ EXPORT_SYMBOL_GPL(xdp_warn); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) { - n_skb = kmem_cache_alloc_bulk(skbuff_cache, gfp, n_skb, skbs); + n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); if (unlikely(!n_skb)) return -ENOMEM; @@ -658,7 +655,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, { struct sk_buff *skb; - skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); if (unlikely(!skb)) return NULL; @@ -771,11 +768,11 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); -BTF_SET8_START(xdp_metadata_kfunc_ids) +BTF_KFUNCS_START(xdp_metadata_kfunc_ids) #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC -BTF_SET8_END(xdp_metadata_kfunc_ids) +BTF_KFUNCS_END(xdp_metadata_kfunc_ids) static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index c4bbac9974..1cba001bb4 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -376,15 +376,11 @@ EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { - dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", - sizeof(struct dccp_ackvec), 0, - SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index a3eeb84d16..e3d388c33d 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -13,7 +13,7 @@ config IP_DCCP_CCID2_DEBUG config IP_DCCP_CCID3 bool "CCID-3 (TCP-Friendly)" - def_bool y if (IP_DCCP = y || IP_DCCP = m) + default IP_DCCP = y || IP_DCCP = m help CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 4d9823d6dc..d6b30700af 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -353,6 +353,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * @sk: socket to perform estimator on + * @mrtt: measured RTT * * This code is almost identical with TCP's tcp_rtt_estimator(), since * - it has a higher sampling frequency (recommended by RFC 1323), diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 8a82c5a2c5..f5019d95c3 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -58,6 +58,7 @@ static int dccp_diag_dump_one(struct netlink_callback *cb, } static const struct inet_diag_handler dccp_diag_handler = { + .owner = THIS_MODULE, .dump = dccp_diag_dump, .dump_one = dccp_diag_dump_one, .idiag_get_info = dccp_diag_get_info, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 44b033fe1e..5926159a6f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -24,6 +24,7 @@ #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/netns/generic.h> +#include <net/rstreason.h> #include "ackvec.h" #include "ccid.h" @@ -521,7 +522,8 @@ out: return err; } -static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) +static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, + enum sk_rst_reason reason) { int err; const struct iphdr *rxiph; @@ -655,8 +657,11 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v4_send_response(sk, req)) goto drop_and_free; - inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); - reqsk_put(req); + if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) + reqsk_free(req); + else + reqsk_put(req); + return 0; drop_and_free: @@ -706,7 +711,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - dccp_v4_ctl_send_reset(sk, skb); + dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); kfree_skb(skb); return 0; } @@ -869,7 +874,7 @@ lookup: if (nsk == sk) { reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v4_ctl_send_reset(sk, skb); + dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); goto discard_and_relse; } else { sock_put(sk); @@ -909,7 +914,7 @@ no_dccp_socket: if (dh->dccph_type != DCCP_PKT_RESET) { DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - dccp_v4_ctl_send_reset(sk, skb); + dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); } discard_it: @@ -1039,7 +1044,7 @@ static void __net_exit dccp_v4_exit_net(struct net *net) static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&dccp_hashinfo, AF_INET); + inet_twsk_purge(&dccp_hashinfo); } static struct pernet_operations dccp_v4_ops = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ded07e09f8..da5dba120b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -29,6 +29,7 @@ #include <net/secure_seq.h> #include <net/netns/generic.h> #include <net/sock.h> +#include <net/rstreason.h> #include "dccp.h" #include "ipv6.h" @@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req) kfree_skb(inet_rsk(req)->pktopts); } -static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) +static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, + enum sk_rst_reason reason) { const struct ipv6hdr *rxip6h; struct sk_buff *skb; @@ -398,8 +400,11 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (dccp_v6_send_response(sk, req)) goto drop_and_free; - inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); - reqsk_put(req); + if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) + reqsk_free(req); + else + reqsk_put(req); + return 0; drop_and_free: @@ -656,7 +661,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - dccp_v6_ctl_send_reset(sk, skb); + dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); discard: if (opt_skb != NULL) __kfree_skb(opt_skb); @@ -762,7 +767,7 @@ lookup: if (nsk == sk) { reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v6_ctl_send_reset(sk, skb); + dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); goto discard_and_relse; } else { sock_put(sk); @@ -801,7 +806,7 @@ no_dccp_socket: if (dh->dccph_type != DCCP_PKT_RESET) { DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - dccp_v6_ctl_send_reset(sk, skb); + dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); } discard_it: @@ -1119,15 +1124,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } -static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo, AF_INET6); -} - static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, - .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 64d805b27a..251a57cf58 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -15,6 +15,7 @@ #include <net/sock.h> #include <net/xfrm.h> #include <net/inet_timewait_sock.h> +#include <net/rstreason.h> #include "ackvec.h" #include "ccid.h" @@ -202,7 +203,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; drop: if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) - req->rsk_ops->send_reset(sk, skb); + req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); inet_csk_reqsk_queue_drop(sk, req); out: diff --git a/net/dccp/output.c b/net/dccp/output.c index fd2eb148d2..5c2e24f3c3 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index ee8d4f5afa..3fc474d6e5 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -90,8 +90,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - - { } }; static struct ctl_table_header *dccp_table_header; diff --git a/net/devlink/core.c b/net/devlink/core.c index 7f0b093208..f49cd83f19 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -314,7 +314,7 @@ static void devlink_release(struct work_struct *work) mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); - kfree(devlink); + kvfree(devlink); } void devlink_put(struct devlink *devlink) @@ -420,7 +420,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, if (!devlink_reload_actions_valid(ops)) return NULL; - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); if (!devlink) return NULL; @@ -455,7 +455,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, return devlink; err_xa_alloc: - kfree(devlink); + kvfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 19dbf54074..13c73f50da 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -1202,23 +1202,19 @@ static void __devlink_compat_running_version(struct devlink *devlink, if (err) goto free_msg; - nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { + nla_for_each_attr_type(nlattr, DEVLINK_ATTR_INFO_VERSION_RUNNING, + (void *)msg->data, msg->len, rem) { const struct nlattr *kv; int rem_kv; - if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) - continue; - - nla_for_each_nested(kv, nlattr, rem_kv) { - if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) - continue; - + nla_for_each_nested_type(kv, DEVLINK_ATTR_INFO_VERSION_VALUE, + nlattr, rem_kv) { strlcat(buf, nla_data(kv), len); strlcat(buf, " ", len); } } free_msg: - nlmsg_free(msg); + nlmsg_consume(msg); } void devlink_compat_running_version(struct devlink *devlink, diff --git a/net/devlink/param.c b/net/devlink/param.c index 22bc3b5005..dcf0d1cceb 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -158,11 +158,12 @@ static int devlink_param_get(struct devlink *devlink, static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { if (!param->set) return -EOPNOTSUPP; - return param->set(devlink, param->id, ctx); + return param->set(devlink, param->id, ctx, extack); } static int @@ -571,7 +572,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; - err = devlink_param_set(devlink, param, &ctx); + err = devlink_param_set(devlink, param, &ctx, info->extack); if (err) return err; } diff --git a/net/devlink/port.c b/net/devlink/port.c index 118d130d2a..be9158b445 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -16,6 +16,7 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), + [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 }, }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ @@ -182,6 +183,30 @@ static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, return 0; } +static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + u32 max_io_eqs; + int err; + + if (!port->ops->port_fn_max_io_eqs_get) + return 0; + + err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs); + if (err) + return err; + *msg_updated = true; + return 0; +} + int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) @@ -410,6 +435,18 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, } static int +devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + u32 max_io_eqs; + + max_io_eqs = nla_get_u32(attr); + return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port, + max_io_eqs, extack); +} + +static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { @@ -430,6 +467,9 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; + err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated); + if (err) + goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, @@ -726,6 +766,12 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, } } } + if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] && + !ops->port_fn_max_io_eqs_set) { + NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS], + "Function does not support max_io_eqs setting"); + return -EOPNOTSUPP; + } return 0; } @@ -761,6 +807,13 @@ static int devlink_port_function_set(struct devlink_port *port, return err; } + attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS]; + if (attr) { + err = devlink_port_fn_max_io_eqs_set(port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c index 431bf52290..0aac887d00 100644 --- a/net/dsa/devlink.c +++ b/net/dsa/devlink.c @@ -194,7 +194,8 @@ int dsa_devlink_param_get(struct devlink *dl, u32 id, EXPORT_SYMBOL_GPL(dsa_devlink_param_get); int dsa_devlink_param_set(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dsa_devlink_to_ds(dl); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ac7be864e8..12521a7d40 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -15,7 +15,6 @@ #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/of.h> -#include <linux/of_mdio.h> #include <linux/of_net.h> #include <net/dsa_stubs.h> #include <net/sch_generic.h> @@ -626,7 +625,6 @@ static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds) static int dsa_switch_setup(struct dsa_switch *ds) { - struct device_node *dn; int err; if (ds->setup) @@ -666,10 +664,7 @@ static int dsa_switch_setup(struct dsa_switch *ds) dsa_user_mii_bus_init(ds); - dn = of_get_child_by_name(ds->dev->of_node, "mdio"); - - err = of_mdiobus_register(ds->user_mii_bus, dn); - of_node_put(dn); + err = mdiobus_register(ds->user_mii_bus); if (err < 0) goto free_user_mii_bus; } @@ -1510,6 +1505,16 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (!ds->num_ports) return -EINVAL; + if (ds->phylink_mac_ops) { + if (ds->ops->phylink_mac_select_pcs || + ds->ops->phylink_mac_prepare || + ds->ops->phylink_mac_config || + ds->ops->phylink_mac_finish || + ds->ops->phylink_mac_link_down || + ds->ops->phylink_mac_link_up) + return -EINVAL; + } + if (np) { err = dsa_switch_parse_of(ds, np); if (err) diff --git a/net/dsa/port.c b/net/dsa/port.c index c42dac8767..9a249d4ac3 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1535,30 +1535,11 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } -static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) -{ - struct device_node *phy_dn; - struct phy_device *phydev; - - phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0); - if (!phy_dn) - return NULL; - - phydev = of_phy_find_device(phy_dn); - if (!phydev) { - of_node_put(phy_dn); - return ERR_PTR(-EPROBE_DEFER); - } - - of_node_put(phy_dn); - return phydev; -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP); struct dsa_switch *ds = dp->ds; @@ -1572,7 +1553,7 @@ static int dsa_port_phylink_mac_prepare(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; int err = 0; @@ -1587,7 +1568,7 @@ static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; if (!ds->ops->phylink_mac_config) @@ -1600,7 +1581,7 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; int err = 0; @@ -1615,18 +1596,11 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct phy_device *phydev = NULL; + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; - if (dsa_port_is_user(dp)) - phydev = dp->user->phydev; - - if (!ds->ops->phylink_mac_link_down) { - if (ds->ops->adjust_link && phydev) - ds->ops->adjust_link(ds, dp->index, phydev); + if (!ds->ops->phylink_mac_link_down) return; - } ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } @@ -1638,14 +1612,11 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, int speed, int duplex, bool tx_pause, bool rx_pause) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; - if (!ds->ops->phylink_mac_link_up) { - if (ds->ops->adjust_link && phydev) - ds->ops->adjust_link(ds, dp->index, phydev); + if (!ds->ops->phylink_mac_link_up) return; - } ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev, speed, duplex, tx_pause, rx_pause); @@ -1662,6 +1633,7 @@ static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { int dsa_port_phylink_create(struct dsa_port *dp) { + const struct phylink_mac_ops *mac_ops; struct dsa_switch *ds = dp->ds; phy_interface_t mode; struct phylink *pl; @@ -1685,8 +1657,12 @@ int dsa_port_phylink_create(struct dsa_port *dp) } } - pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), - mode, &dsa_port_phylink_mac_ops); + mac_ops = &dsa_port_phylink_mac_ops; + if (ds->phylink_mac_ops) + mac_ops = ds->phylink_mac_ops; + + pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode, + mac_ops); if (IS_ERR(pl)) { pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl)); return PTR_ERR(pl); @@ -1703,78 +1679,6 @@ void dsa_port_phylink_destroy(struct dsa_port *dp) dp->pl = NULL; } -static int dsa_shared_port_setup_phy_of(struct dsa_port *dp, bool enable) -{ - struct dsa_switch *ds = dp->ds; - struct phy_device *phydev; - int port = dp->index; - int err = 0; - - phydev = dsa_port_get_phy_device(dp); - if (!phydev) - return 0; - - if (IS_ERR(phydev)) - return PTR_ERR(phydev); - - if (enable) { - err = genphy_resume(phydev); - if (err < 0) - goto err_put_dev; - - err = genphy_read_status(phydev); - if (err < 0) - goto err_put_dev; - } else { - err = genphy_suspend(phydev); - if (err < 0) - goto err_put_dev; - } - - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); - - dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev)); - -err_put_dev: - put_device(&phydev->mdio.dev); - return err; -} - -static int dsa_shared_port_fixed_link_register_of(struct dsa_port *dp) -{ - struct device_node *dn = dp->dn; - struct dsa_switch *ds = dp->ds; - struct phy_device *phydev; - int port = dp->index; - phy_interface_t mode; - int err; - - err = of_phy_register_fixed_link(dn); - if (err) { - dev_err(ds->dev, - "failed to register the fixed PHY of port %d\n", - port); - return err; - } - - phydev = of_phy_find_device(dn); - - err = of_get_phy_mode(dn, &mode); - if (err) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; - - genphy_read_status(phydev); - - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); - - put_device(&phydev->mdio.dev); - - return 0; -} - static int dsa_shared_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; @@ -1952,12 +1856,23 @@ static void dsa_shared_port_validate_of(struct dsa_port *dp, dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); } +static void dsa_shared_port_link_down(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down) + ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED, + PHY_INTERFACE_MODE_NA); + else if (ds->ops->phylink_mac_link_down) + ds->ops->phylink_mac_link_down(ds, dp->index, MLO_AN_FIXED, + PHY_INTERFACE_MODE_NA); +} + int dsa_shared_port_link_register_of(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; bool missing_link_description; bool missing_phy_mode; - int port = dp->index; dsa_shared_port_validate_of(dp, &missing_phy_mode, &missing_link_description); @@ -1967,46 +1882,28 @@ int dsa_shared_port_link_register_of(struct dsa_port *dp) dsa_switches_apply_workarounds)) return -EINVAL; - if (!ds->ops->adjust_link) { - if (missing_link_description) { - dev_warn(ds->dev, - "Skipping phylink registration for %s port %d\n", - dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); - } else { - if (ds->ops->phylink_mac_link_down) - ds->ops->phylink_mac_link_down(ds, port, - MLO_AN_FIXED, PHY_INTERFACE_MODE_NA); + if (missing_link_description) { + dev_warn(ds->dev, + "Skipping phylink registration for %s port %d\n", + dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); + } else { + dsa_shared_port_link_down(dp); - return dsa_shared_port_phylink_register(dp); - } - return 0; + return dsa_shared_port_phylink_register(dp); } - dev_warn(ds->dev, - "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); - - if (of_phy_is_fixed_link(dp->dn)) - return dsa_shared_port_fixed_link_register_of(dp); - else - return dsa_shared_port_setup_phy_of(dp, true); + return 0; } void dsa_shared_port_link_unregister_of(struct dsa_port *dp) { - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->adjust_link && dp->pl) { + if (dp->pl) { rtnl_lock(); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); return; } - - if (of_phy_is_fixed_link(dp->dn)) - of_phy_deregister_fixed_link(dp->dn); - else - dsa_shared_port_setup_phy_of(dp, false); } int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr, diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 2717e9d7b6..1aba1d05c2 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -75,7 +75,7 @@ sja1105_tagger_private(struct dsa_switch *ds) } /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ -static inline bool sja1105_is_link_local(const struct sk_buff *skb) +static bool sja1105_is_link_local(const struct sk_buff *skb) { const struct ethhdr *hdr = eth_hdr(skb); u64 dmac = ether_addr_to_u64(hdr->h_dest); @@ -121,7 +121,7 @@ static void sja1105_meta_unpack(const struct sk_buff *skb, packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0); } -static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) +static bool sja1105_is_meta_frame(const struct sk_buff *skb) { const struct ethhdr *hdr = eth_hdr(skb); u64 smac = ether_addr_to_u64(hdr->h_source); diff --git a/net/dsa/trace.h b/net/dsa/trace.h index 567f29a397..83f3e5f784 100644 --- a/net/dsa/trace.h +++ b/net/dsa/trace.h @@ -39,8 +39,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_op_hw, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; ether_addr_copy(__entry->addr, addr); __entry->vid = vid; @@ -98,8 +98,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_op_refcount, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; ether_addr_copy(__entry->addr, addr); __entry->vid = vid; @@ -157,8 +157,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_del_not_found, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; ether_addr_copy(__entry->addr, addr); __entry->vid = vid; @@ -199,7 +199,7 @@ TRACE_EVENT(dsa_lag_fdb_add_hw, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -227,7 +227,7 @@ TRACE_EVENT(dsa_lag_fdb_add_bump, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -255,7 +255,7 @@ TRACE_EVENT(dsa_lag_fdb_del_hw, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -283,7 +283,7 @@ TRACE_EVENT(dsa_lag_fdb_del_drop, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -310,7 +310,7 @@ TRACE_EVENT(dsa_lag_fdb_del_not_found, ), TP_fast_assign( - __assign_str(dev, lag_dev->name); + __assign_str(dev); ether_addr_copy(__entry->addr, addr); __entry->vid = vid; dsa_db_print(db, __entry->db_buf); @@ -338,8 +338,8 @@ DECLARE_EVENT_CLASS(dsa_vlan_op_hw, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; __entry->vid = vlan->vid; __entry->flags = vlan->flags; @@ -383,8 +383,8 @@ DECLARE_EVENT_CLASS(dsa_vlan_op_refcount, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; __entry->vid = vlan->vid; __entry->flags = vlan->flags; @@ -426,8 +426,8 @@ TRACE_EVENT(dsa_vlan_del_not_found, ), TP_fast_assign( - __assign_str(dev, dev_name(dp->ds->dev)); - __assign_str(kind, dsa_port_kind(dp)); + __assign_str(dev); + __assign_str(kind); __entry->port = dp->index; __entry->vid = vlan->vid; ), diff --git a/net/dsa/user.c b/net/dsa/user.c index b15e71cc34..867c5fe9a4 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -210,7 +210,7 @@ static int dsa_user_sync_uc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, @@ -230,7 +230,7 @@ static int dsa_user_unsync_uc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } static int dsa_user_sync_mc(struct net_device *dev, @@ -250,7 +250,7 @@ static int dsa_user_sync_mc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, @@ -270,7 +270,7 @@ static int dsa_user_unsync_mc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } void dsa_user_sync_ha(struct net_device *dev) @@ -352,7 +352,7 @@ void dsa_user_mii_bus_init(struct dsa_switch *ds) /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { - return dsa_user_to_conduit(dev)->ifindex; + return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } static int dsa_user_open(struct net_device *dev) @@ -875,8 +875,8 @@ static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, return err; } -static inline netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, - struct sk_buff *skb) +static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, + struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); @@ -1222,7 +1222,7 @@ static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) return ret; } -static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e) +static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; @@ -1242,7 +1242,7 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e) return phylink_ethtool_set_eee(dp->pl, e); } -static int dsa_user_get_eee(struct net_device *dev, struct ethtool_eee *e) +static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; @@ -2120,7 +2120,7 @@ int dsa_user_change_mtu(struct net_device *dev, int new_mtu) if (err) goto out_port_failed; - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); @@ -2137,6 +2137,32 @@ out_conduit_failed: } static int __maybe_unused +dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_set_apptrust) + return -EOPNOTSUPP; + + return ds->ops->port_set_apptrust(ds, port, sel, nsel); +} + +static int __maybe_unused +dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_get_apptrust) + return -EOPNOTSUPP; + + return ds->ops->port_get_apptrust(ds, port, sel, nsel); +} + +static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); @@ -2163,6 +2189,58 @@ dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) return 0; } +/* Update the DSCP prio entries on all user ports of the switch in case + * the switch supports global DSCP prio instead of per port DSCP prios. + */ +static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, + struct dcb_app *app, bool del) +{ + int (*setdel)(struct net_device *dev, struct dcb_app *app); + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct dsa_port *other_dp; + int err, restore_err; + + if (del) + setdel = dcb_ieee_delapp; + else + setdel = dcb_ieee_setapp; + + dsa_switch_for_each_user_port(other_dp, ds) { + struct net_device *user = other_dp->user; + + if (!user || user == dev) + continue; + + err = setdel(user, app); + if (err) + goto err_try_to_restore; + } + + return 0; + +err_try_to_restore: + + /* Revert logic to restore previous state of app entries */ + if (!del) + setdel = dcb_ieee_delapp; + else + setdel = dcb_ieee_setapp; + + dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { + struct net_device *user = other_dp->user; + + if (!user || user == dev) + continue; + + restore_err = setdel(user, app); + if (restore_err) + netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); + } + + return err; +} + static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { @@ -2194,6 +2272,17 @@ dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) return err; } + if (!ds->dscp_prio_mapping_is_global) + return 0; + + err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); + if (err) { + if (ds->ops->port_del_dscp_prio) + ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); + dcb_ieee_delapp(dev, app); + return err; + } + return 0; } @@ -2264,6 +2353,18 @@ dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) return err; } + if (!ds->dscp_prio_mapping_is_global) + return 0; + + err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); + if (err) { + if (ds->ops->port_add_dscp_prio) + ds->ops->port_add_dscp_prio(ds, port, dscp, + app->priority); + dcb_ieee_setapp(dev, app); + return err; + } + return 0; } @@ -2376,6 +2477,8 @@ static const struct ethtool_ops dsa_user_ethtool_ops = { static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, + .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, + .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, @@ -2429,7 +2532,7 @@ static const struct net_device_ops dsa_user_netdev_ops = { .ndo_fill_forward_path = dsa_user_fill_forward_path, }; -static struct device_type dsa_type = { +static const struct device_type dsa_type = { .name = "dsa", }; @@ -2445,7 +2548,7 @@ EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would @@ -2625,11 +2728,7 @@ int dsa_user_create(struct dsa_port *port) user_dev->vlan_features = conduit->vlan_features; p = netdev_priv(user_dev); - user_dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!user_dev->tstats) { - free_netdev(user_dev); - return -ENOMEM; - } + user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ret = gro_cells_init(&p->gcells, user_dev); if (ret) @@ -2695,7 +2794,6 @@ out_phy: out_gcells: gro_cells_destroy(&p->gcells); out_free: - free_percpu(user_dev->tstats); free_netdev(user_dev); port->user = NULL; return ret; @@ -2716,7 +2814,6 @@ void dsa_user_destroy(struct net_device *user_dev) dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); - free_percpu(user_dev->tstats); free_netdev(user_dev); } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 049c3adeb8..4e3651101b 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -161,9 +161,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); - eth = (struct ethhdr *)skb->data; - skb_pull_inline(skb, ETH_HLEN); - + eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c index 2853394d06..bf398973eb 100644 --- a/net/ethtool/eee.c +++ b/net/ethtool/eee.c @@ -4,16 +4,13 @@ #include "common.h" #include "bitset.h" -#define EEE_MODES_COUNT \ - (sizeof_field(struct ethtool_eee, supported) * BITS_PER_BYTE) - struct eee_req_info { struct ethnl_req_info base; }; struct eee_reply_data { struct ethnl_reply_data base; - struct ethtool_eee eee; + struct ethtool_keee eee; }; #define EEE_REPDATA(__reply_base) \ @@ -30,6 +27,7 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base, { struct eee_reply_data *data = EEE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; + struct ethtool_keee *eee = &data->eee; int ret; if (!dev->ethtool_ops->get_eee) @@ -37,7 +35,7 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - ret = dev->ethtool_ops->get_eee(dev, &data->eee); + ret = dev->ethtool_ops->get_eee(dev, eee); ethnl_ops_complete(dev); return ret; @@ -48,24 +46,21 @@ static int eee_reply_size(const struct ethnl_req_info *req_base, { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct eee_reply_data *data = EEE_REPDATA(reply_base); - const struct ethtool_eee *eee = &data->eee; + const struct ethtool_keee *eee = &data->eee; int len = 0; int ret; - BUILD_BUG_ON(sizeof(eee->advertised) * BITS_PER_BYTE != - EEE_MODES_COUNT); - BUILD_BUG_ON(sizeof(eee->lp_advertised) * BITS_PER_BYTE != - EEE_MODES_COUNT); - /* MODES_OURS */ - ret = ethnl_bitset32_size(&eee->advertised, &eee->supported, - EEE_MODES_COUNT, link_mode_names, compact); + ret = ethnl_bitset_size(eee->advertised, eee->supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; len += ret; /* MODES_PEERS */ - ret = ethnl_bitset32_size(&eee->lp_advertised, NULL, - EEE_MODES_COUNT, link_mode_names, compact); + ret = ethnl_bitset_size(eee->lp_advertised, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; len += ret; @@ -84,24 +79,26 @@ static int eee_fill_reply(struct sk_buff *skb, { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct eee_reply_data *data = EEE_REPDATA(reply_base); - const struct ethtool_eee *eee = &data->eee; + const struct ethtool_keee *eee = &data->eee; int ret; - ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS, - &eee->advertised, &eee->supported, - EEE_MODES_COUNT, link_mode_names, compact); + ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_OURS, + eee->advertised, eee->supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; - ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER, - &eee->lp_advertised, NULL, EEE_MODES_COUNT, - link_mode_names, compact); + ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_PEER, + eee->lp_advertised, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; - if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, !!eee->eee_active) || - nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, !!eee->eee_enabled) || + if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, eee->eee_active) || + nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, eee->eee_enabled) || nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED, - !!eee->tx_lpi_enabled) || + eee->tx_lpi_enabled) || nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer)) return -EMSGSIZE; @@ -132,7 +129,7 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct ethtool_eee eee = {}; + struct ethtool_keee eee = {}; bool mod = false; int ret; @@ -140,14 +137,15 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info) if (ret < 0) return ret; - ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT, - tb[ETHTOOL_A_EEE_MODES_OURS], - link_mode_names, info->extack, &mod); + ret = ethnl_update_bitset(eee.advertised, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_EEE_MODES_OURS], + link_mode_names, info->extack, &mod); if (ret < 0) return ret; - ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); - ethnl_update_bool32(&eee.tx_lpi_enabled, - tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod); + ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); + ethnl_update_bool(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], + &mod); ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER], &mod); if (!mod) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7519b0818b..fcc3dbef8b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -26,12 +26,12 @@ #include <linux/sched/signal.h> #include <linux/net.h> #include <linux/pm_runtime.h> +#include <linux/utsname.h> #include <net/devlink.h> #include <net/ipv6.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> -#include <generated/utsrelease.h> #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ @@ -713,7 +713,8 @@ ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp) struct device *parent = dev->dev.parent; rsp->info.cmd = ETHTOOL_GDRVINFO; - strscpy(rsp->info.version, UTS_RELEASE, sizeof(rsp->info.version)); + strscpy(rsp->info.version, init_uts_ns.name.release, + sizeof(rsp->info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &rsp->info); if (!rsp->info.bus_info[0] && parent) @@ -1276,11 +1277,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; u32 dev_indir_size = 0, dev_key_size = 0, i; + u32 user_indir_len = 0, indir_bytes = 0; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; - u32 indir_bytes = 0; u8 *rss_config; int ret; @@ -1305,7 +1306,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; - if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && + if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE && + (rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && !ops->cap_rss_sym_xor_supported) return -EOPNOTSUPP; @@ -1340,6 +1342,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { + user_indir_len = indir_bytes; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, @@ -1366,7 +1369,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.key_size = dev_key_size; rxfh_dev.key = rss_config + indir_bytes; if (copy_from_user(rxfh_dev.key, - useraddr + rss_cfg_offset + indir_bytes, + useraddr + rss_cfg_offset + user_indir_len, rxfh.key_size)) { ret = -EFAULT; goto out; @@ -1508,22 +1511,57 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return 0; } +static void eee_to_keee(struct ethtool_keee *keee, + const struct ethtool_eee *eee) +{ + memset(keee, 0, sizeof(*keee)); + + keee->eee_enabled = eee->eee_enabled; + keee->tx_lpi_enabled = eee->tx_lpi_enabled; + keee->tx_lpi_timer = eee->tx_lpi_timer; + + ethtool_convert_legacy_u32_to_link_mode(keee->advertised, + eee->advertised); +} + +static void keee_to_eee(struct ethtool_eee *eee, + const struct ethtool_keee *keee) +{ + bool overflow; + + memset(eee, 0, sizeof(*eee)); + + eee->eee_active = keee->eee_active; + eee->eee_enabled = keee->eee_enabled; + eee->tx_lpi_enabled = keee->tx_lpi_enabled; + eee->tx_lpi_timer = keee->tx_lpi_timer; + + overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported, + keee->supported); + ethtool_convert_link_mode_to_legacy_u32(&eee->advertised, + keee->advertised); + ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised, + keee->lp_advertised); + if (overflow) + pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n"); +} + static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) { - struct ethtool_eee edata; + struct ethtool_keee keee; + struct ethtool_eee eee; int rc; if (!dev->ethtool_ops->get_eee) return -EOPNOTSUPP; - memset(&edata, 0, sizeof(struct ethtool_eee)); - edata.cmd = ETHTOOL_GEEE; - rc = dev->ethtool_ops->get_eee(dev, &edata); - + memset(&keee, 0, sizeof(keee)); + rc = dev->ethtool_ops->get_eee(dev, &keee); if (rc) return rc; - if (copy_to_user(useraddr, &edata, sizeof(edata))) + keee_to_eee(&eee, &keee); + if (copy_to_user(useraddr, &eee, sizeof(eee))) return -EFAULT; return 0; @@ -1531,16 +1569,18 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { - struct ethtool_eee edata; + struct ethtool_keee keee; + struct ethtool_eee eee; int ret; if (!dev->ethtool_ops->set_eee) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) + if (copy_from_user(&eee, useraddr, sizeof(eee))) return -EFAULT; - ret = dev->ethtool_ops->set_eee(dev, &edata); + eee_to_keee(&keee, &eee); + ret = dev->ethtool_ops->set_eee(dev, &keee); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); return ret; @@ -2182,7 +2222,7 @@ static int ethtool_get_phy_stats_ethtool(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; int n_stats, ret; - if (!ops || !ops->get_sset_count || ops->get_ethtool_phy_stats) + if (!ops || !ops->get_sset_count || !ops->get_ethtool_phy_stats) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index b2de2108b3..34d76e8784 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -37,6 +37,8 @@ static int linkstate_get_sqi(struct net_device *dev) mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi) ret = -EOPNOTSUPP; + else if (!phydev->link) + ret = -ENETDOWN; else ret = phydev->drv->get_sqi(phydev); mutex_unlock(&phydev->lock); @@ -55,6 +57,8 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi_max) ret = -EOPNOTSUPP; + else if (!phydev->link) + ret = -ENETDOWN; else ret = phydev->drv->get_sqi_max(phydev); mutex_unlock(&phydev->lock); @@ -62,6 +66,17 @@ static int linkstate_get_sqi_max(struct net_device *dev) return ret; }; +static bool linkstate_sqi_critical_error(int sqi) +{ + return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN; +} + +static bool linkstate_sqi_valid(struct linkstate_reply_data *data) +{ + return data->sqi >= 0 && data->sqi_max >= 0 && + data->sqi <= data->sqi_max; +} + static int linkstate_get_link_ext_state(struct net_device *dev, struct linkstate_reply_data *data) { @@ -93,12 +108,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, data->link = __ethtool_get_link(dev); ret = linkstate_get_sqi(dev); - if (ret < 0 && ret != -EOPNOTSUPP) + if (linkstate_sqi_critical_error(ret)) goto out; data->sqi = ret; ret = linkstate_get_sqi_max(dev); - if (ret < 0 && ret != -EOPNOTSUPP) + if (linkstate_sqi_critical_error(ret)) goto out; data->sqi_max = ret; @@ -136,11 +151,10 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; - if (data->sqi != -EOPNOTSUPP) - len += nla_total_size(sizeof(u32)); - - if (data->sqi_max != -EOPNOTSUPP) - len += nla_total_size(sizeof(u32)); + if (linkstate_sqi_valid(data)) { + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */ + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */ + } if (data->link_ext_state_provided) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ @@ -164,13 +178,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; - if (data->sqi != -EOPNOTSUPP && - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) - return -EMSGSIZE; + if (linkstate_sqi_valid(data)) { + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; - if (data->sqi_max != -EOPNOTSUPP && - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) - return -EMSGSIZE; + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, + data->sqi_max)) + return -EMSGSIZE; + } if (data->link_ext_state_provided) { if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index fe3553f60b..bd04f28d5c 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -477,11 +477,7 @@ out: return ret; } -/* Default ->dumpit() handler for GET requests. Device iteration copied from - * rtnl_dump_ifinfo(); we have to be more careful about device hashtable - * persistence as we cannot guarantee to hold RTNL lock through the whole - * function as rtnetnlink does. - */ +/* Default ->dumpit() handler for GET requests. */ static int ethnl_default_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -490,14 +486,14 @@ static int ethnl_default_dumpit(struct sk_buff *skb, struct net_device *dev; int ret = 0; - rtnl_lock(); + rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { dev_hold(dev); - rtnl_unlock(); + rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); - rtnl_lock(); + rcu_read_lock(); dev_put(dev); if (ret < 0 && ret != -EOPNOTSUPP) { @@ -507,7 +503,7 @@ static int ethnl_default_dumpit(struct sk_buff *skb, } ret = 0; } - rtnl_unlock(); + rcu_read_unlock(); return ret; } diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index cc478af771..776ac96cda 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -82,6 +82,10 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ if (st->podl_pw_status > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */ + if (st->c33_admin_state > 0) + len += nla_total_size(sizeof(u32)); /* _C33_PSE_ADMIN_STATE */ + if (st->c33_pw_status > 0) + len += nla_total_size(sizeof(u32)); /* _C33_PSE_PW_D_STATUS */ return len; } @@ -103,6 +107,16 @@ static int pse_fill_reply(struct sk_buff *skb, st->podl_pw_status)) return -EMSGSIZE; + if (st->c33_admin_state > 0 && + nla_put_u32(skb, ETHTOOL_A_C33_PSE_ADMIN_STATE, + st->c33_admin_state)) + return -EMSGSIZE; + + if (st->c33_pw_status > 0 && + nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_D_STATUS, + st->c33_pw_status)) + return -EMSGSIZE; + return 0; } @@ -113,25 +127,18 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), + [ETHTOOL_A_C33_PSE_ADMIN_CONTROL] = + NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED), }; static int ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) { - return !!info->attrs[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]; -} - -static int -ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) -{ struct net_device *dev = req_info->dev; - struct pse_control_config config = {}; struct nlattr **tb = info->attrs; struct phy_device *phydev; - /* this values are already validated by the ethnl_pse_set_policy */ - config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); - phydev = dev->phydev; if (!phydev) { NL_SET_ERR_MSG(info->extack, "No PHY is attached"); @@ -143,7 +150,42 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) return -EOPNOTSUPP; } - /* Return errno directly - PSE has no notification */ + if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] && + !pse_has_podl(phydev->psec)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL], + "setting PoDL PSE admin control not supported"); + return -EOPNOTSUPP; + } + if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL] && + !pse_has_c33(phydev->psec)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL], + "setting C33 PSE admin control not supported"); + return -EOPNOTSUPP; + } + + return 1; +} + +static int +ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; + struct pse_control_config config = {}; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; + + phydev = dev->phydev; + /* These values are already validated by the ethnl_pse_set_policy */ + if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]) + config.podl_admin_control = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); + if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]) + config.c33_admin_control = nla_get_u32(tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]); + + /* Return errno directly - PSE has no notification + * pse_ethtool_set_config() will do nothing if the config is null + */ return pse_ethtool_set_config(phydev->psec, info->extack, &config); } diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 71679137ef..5c4c4505ab 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -111,7 +111,8 @@ rss_reply_size(const struct ethnl_req_info *req_base, const struct rss_reply_data *data = RSS_REPDATA(reply_base); int len; - len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ + len = nla_total_size(sizeof(u32)) + /* _RSS_CONTEXT */ + nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ nla_total_size(data->hkey_size); /* _RSS_HKEY */ @@ -124,6 +125,11 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); + struct rss_req_info *request = RSS_REQINFO(req_base); + + if (request->rss_context && + nla_put_u32(skb, ETHTOOL_A_RSS_CONTEXT, request->rss_context)) + return -EMSGSIZE; if ((data->hfunc && nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 9daed0aab1..57d496287e 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -13,14 +13,18 @@ struct tsinfo_req_info { struct tsinfo_reply_data { struct ethnl_reply_data base; struct ethtool_ts_info ts_info; + struct ethtool_ts_stats stats; }; #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) +#define ETHTOOL_TS_STAT_CNT \ + (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) + const struct nla_policy ethnl_tsinfo_get_policy[] = { [ETHTOOL_A_TSINFO_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, @@ -34,6 +38,12 @@ static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + if (req_base->flags & ETHTOOL_FLAG_STATS) { + ethtool_stats_init((u64 *)&data->stats, + sizeof(data->stats) / sizeof(u64)); + if (dev->ethtool_ops->get_ts_stats) + dev->ethtool_ops->get_ts_stats(dev, &data->stats); + } ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); @@ -79,10 +89,47 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, } if (ts_info->phc_index >= 0) len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += nla_total_size(0) + /* _TSINFO_STATS */ + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; return len; } +static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) +{ + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + if (nla_put_uint(skb, attrtype, val)) + return -EMSGSIZE; + return 0; +} + +static int tsinfo_put_stats(struct sk_buff *skb, + const struct ethtool_ts_stats *stats) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS); + if (!nest) + return -EMSGSIZE; + + if (tsinfo_put_stat(skb, stats->tx_stats.pkts, + ETHTOOL_A_TS_STAT_TX_PKTS) || + tsinfo_put_stat(skb, stats->tx_stats.lost, + ETHTOOL_A_TS_STAT_TX_LOST) || + tsinfo_put_stat(skb, stats->tx_stats.err, + ETHTOOL_A_TS_STAT_TX_ERR)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int tsinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) @@ -119,6 +166,9 @@ static int tsinfo_fill_reply(struct sk_buff *skb, if (ts_info->phc_index >= 0 && nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) return -EMSGSIZE; + if (req_base->flags & ETHTOOL_FLAG_STATS && + tsinfo_put_stats(skb, &data->stats)) + return -EMSGSIZE; return 0; } diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index d697f68c59..d6f5283982 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -213,7 +213,6 @@ static int tls_handshake_accept(struct handshake_req *req, if (!hdr) goto out_cancel; - ret = -EMSGSIZE; ret = nla_put_s32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd); if (ret < 0) goto out_cancel; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 9d71b66183..e6904288d4 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -28,29 +28,19 @@ static bool is_slave_up(struct net_device *dev) return dev && is_admin_up(dev) && netif_oper_up(dev); } -static void __hsr_set_operstate(struct net_device *dev, int transition) -{ - write_lock(&dev_base_lock); - if (dev->operstate != transition) { - dev->operstate = transition; - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } else { - write_unlock(&dev_base_lock); - } -} - static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { - if (!is_admin_up(master->dev)) { - __hsr_set_operstate(master->dev, IF_OPER_DOWN); + struct net_device *dev = master->dev; + + if (!is_admin_up(dev)) { + netdev_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) - __hsr_set_operstate(master->dev, IF_OPER_UP); + netdev_set_operstate(dev, IF_OPER_UP); else - __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); + netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) @@ -71,39 +61,36 @@ static bool hsr_check_carrier(struct hsr_port *master) return false; } -static void hsr_check_announce(struct net_device *hsr_dev, - unsigned char old_operstate) +static void hsr_check_announce(struct net_device *hsr_dev) { struct hsr_priv *hsr; hsr = netdev_priv(hsr_dev); - - if (hsr_dev->operstate == IF_OPER_UP && old_operstate != IF_OPER_UP) { - /* Went up */ - hsr->announce_count = 0; - mod_timer(&hsr->announce_timer, - jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); + if (netif_running(hsr_dev) && netif_oper_up(hsr_dev)) { + /* Enable announce timer and start sending supervisory frames */ + if (!timer_pending(&hsr->announce_timer)) { + hsr->announce_count = 0; + mod_timer(&hsr->announce_timer, jiffies + + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); + } + } else { + /* Deactivate the announce timer */ + timer_delete(&hsr->announce_timer); } - - if (hsr_dev->operstate != IF_OPER_UP && old_operstate == IF_OPER_UP) - /* Went down */ - del_timer(&hsr->announce_timer); } void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) { struct hsr_port *master; - unsigned char old_operstate; bool has_carrier; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); /* netif_stacked_transfer_operstate() cannot be used here since * it doesn't set IF_OPER_LOWERLAYERDOWN (?) */ - old_operstate = master->dev->operstate; has_carrier = hsr_check_carrier(master); hsr_set_operstate(master, has_carrier); - hsr_check_announce(master->dev, old_operstate); + hsr_check_announce(master->dev); } int hsr_get_max_mtu(struct hsr_priv *hsr) @@ -133,7 +120,7 @@ static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } @@ -142,30 +129,32 @@ static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; - char designation; + const char *designation = NULL; hsr = netdev_priv(dev); - designation = '\0'; hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: - designation = 'A'; + designation = "Slave A"; break; case HSR_PT_SLAVE_B: - designation = 'B'; + designation = "Slave B"; + break; + case HSR_PT_INTERLINK: + designation = "Interlink"; break; default: - designation = '?'; + designation = "Unknown"; } if (!is_slave_up(port->dev)) - netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", + netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } - if (designation == '\0') + if (!designation) netdev_warn(dev, "No slave devices configured\n"); return 0; @@ -296,6 +285,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, struct hsr_priv *hsr = master->hsr; __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tlv *hsr_stlv; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; @@ -335,6 +325,16 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); + if (hsr->redbox) { + hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv)); + hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC; + hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload); + + /* Payload: MacAddressRedBox */ + hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); + ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox); + } + if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; @@ -416,6 +416,10 @@ void hsr_del_ports(struct hsr_priv *hsr) if (port) hsr_del_port(port); + port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + if (port) + hsr_del_port(port); + port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port) hsr_del_port(port); @@ -477,7 +481,7 @@ static const struct net_device_ops hsr_device_ops = { .ndo_set_rx_mode = hsr_set_rx_mode, }; -static struct device_type hsr_type = { +static const struct device_type hsr_type = { .name = "hsr", }; @@ -545,8 +549,8 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { }; int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], - unsigned char multicast_spec, u8 protocol_version, - struct netlink_ext_ack *extack) + struct net_device *interlink, unsigned char multicast_spec, + u8 protocol_version, struct netlink_ext_ack *extack) { bool unregister = false; struct hsr_priv *hsr; @@ -555,6 +559,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); + INIT_LIST_HEAD(&hsr->proxy_node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); @@ -580,9 +585,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; + hsr->interlink_sequence_nr = HSR_SEQNR_START; timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); + timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; @@ -615,6 +622,17 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + if (interlink) { + res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); + if (res) + goto err_unregister; + + hsr->redbox = true; + ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr); + mod_timer(&hsr->prune_proxy_timer, + jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); + } + hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 9060c92168..655284095b 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -16,8 +16,8 @@ void hsr_del_ports(struct hsr_priv *hsr); void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], - unsigned char multicast_spec, u8 protocol_version, - struct netlink_ext_ack *extack); + struct net_device *interlink, unsigned char multicast_spec, + u8 protocol_version, struct netlink_ext_ack *extack); void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); int hsr_get_max_mtu(struct hsr_priv *hsr); #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 5d68cb1816..05a61b8286 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -377,6 +377,15 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, */ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); } + + /* When HSR node is used as RedBox - the frame received from HSR ring + * requires source MAC address (SA) replacement to one which can be + * recognized by SAN devices (otherwise, frames are dropped by switch) + */ + if (port->type == HSR_PT_INTERLINK) + ether_addr_copy(eth_hdr(skb)->h_source, + port->hsr->macaddress_redbox); + return dev_queue_xmit(skb); } @@ -390,9 +399,57 @@ bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { + struct sk_buff *skb; + if (port->dev->features & NETIF_F_HW_HSR_FWD) return prp_drop_frame(frame, port); + /* RedBox specific frames dropping policies + * + * Do not send HSR supervisory frames to SAN devices + */ + if (frame->is_supervision && port->type == HSR_PT_INTERLINK) + return true; + + /* Do not forward to other HSR port (A or B) unicast frames which + * are addressed to interlink port (and are in the ProxyNodeTable). + */ + skb = frame->skb_hsr; + if (skb && prp_drop_frame(frame, port) && + is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + hsr_is_node_in_db(&port->hsr->proxy_node_db, + eth_hdr(skb)->h_dest)) { + return true; + } + + /* Do not forward to port C (Interlink) frames from nodes A and B + * if DA is in NodeTable. + */ + if ((frame->port_rcv->type == HSR_PT_SLAVE_A || + frame->port_rcv->type == HSR_PT_SLAVE_B) && + port->type == HSR_PT_INTERLINK) { + skb = frame->skb_hsr; + if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + hsr_is_node_in_db(&port->hsr->node_db, + eth_hdr(skb)->h_dest)) { + return true; + } + } + + /* Do not forward to port A and B unicast frames received on the + * interlink port if it is addressed to one of nodes registered in + * the ProxyNodeTable. + */ + if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) && + frame->port_rcv->type == HSR_PT_INTERLINK) { + skb = frame->skb_std; + if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + hsr_is_node_in_db(&port->hsr->proxy_node_db, + eth_hdr(skb)->h_dest)) { + return true; + } + } + return false; } @@ -448,13 +505,14 @@ static void hsr_forward_do(struct hsr_frame_info *frame) } /* Check if frame is to be dropped. Eg. for PRP no forward - * between ports. + * between ports, or sending HSR supervision to RedBox. */ if (hsr->proto_ops->drop_frame && hsr->proto_ops->drop_frame(frame, port)) continue; - if (port->type != HSR_PT_MASTER) + if (port->type == HSR_PT_SLAVE_A || + port->type == HSR_PT_SLAVE_B) skb = hsr->proto_ops->create_tagged_frame(frame, port); else skb = hsr->proto_ops->get_untagged_frame(frame, port); @@ -469,7 +527,9 @@ static void hsr_forward_do(struct hsr_frame_info *frame) hsr_deliver_master(skb, port->dev, frame->node_src); } else { if (!hsr_xmit(skb, port, frame)) - sent = true; + if (port->type == HSR_PT_SLAVE_A || + port->type == HSR_PT_SLAVE_B) + sent = true; } } } @@ -503,10 +563,12 @@ static void handle_std_frame(struct sk_buff *skb, frame->skb_prp = NULL; frame->skb_std = skb; - if (port->type != HSR_PT_MASTER) { + if (port->type != HSR_PT_MASTER) frame->is_from_san = true; - } else { - /* Sequence nr for the master node */ + + if (port->type == HSR_PT_MASTER || + port->type == HSR_PT_INTERLINK) { + /* Sequence nr for the master/interlink node */ lockdep_assert_held(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; @@ -564,6 +626,7 @@ static int fill_frame_info(struct hsr_frame_info *frame, { struct hsr_priv *hsr = port->hsr; struct hsr_vlan_ethhdr *vlan_hdr; + struct list_head *n_db; struct ethhdr *ethhdr; __be16 proto; int ret; @@ -574,9 +637,13 @@ static int fill_frame_info(struct hsr_frame_info *frame, memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); - frame->node_src = hsr_get_node(port, &hsr->node_db, skb, - frame->is_supervision, - port->type); + + n_db = &hsr->node_db; + if (port->type == HSR_PT_INTERLINK) + n_db = &hsr->proxy_node_db; + + frame->node_src = hsr_get_node(port, n_db, skb, + frame->is_supervision, port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 26329db092..614df96497 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -71,6 +71,14 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, return NULL; } +/* Check if node for a given MAC address is already present in data base + */ +bool hsr_is_node_in_db(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]) +{ + return !!find_node_by_addr_A(node_db, addr); +} + /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ @@ -223,6 +231,15 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, } } + /* Check if required node is not in proxy nodes table */ + list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { + if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { + if (hsr->proto_ops->update_san_info) + hsr->proto_ops->update_san_info(node, is_sup); + return node; + } + } + /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ @@ -418,6 +435,10 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); + if (!node_dst && port->hsr->redbox) + node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db, + eth_hdr(skb)->h_dest); + if (!node_dst) { if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); @@ -561,6 +582,37 @@ void hsr_prune_nodes(struct timer_list *t) jiffies + msecs_to_jiffies(PRUNE_PERIOD)); } +void hsr_prune_proxy_nodes(struct timer_list *t) +{ + struct hsr_priv *hsr = from_timer(hsr, t, prune_proxy_timer); + unsigned long timestamp; + struct hsr_node *node; + struct hsr_node *tmp; + + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) { + timestamp = node->time_in[HSR_PT_INTERLINK]; + + /* Prune old entries */ + if (time_is_before_jiffies(timestamp + + msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) { + hsr_nl_nodedown(hsr, node->macaddress_A); + if (!node->removed) { + list_del_rcu(&node->mac_list); + node->removed = true; + /* Note that we need to free this entry later: */ + kfree_rcu(node, rcu_head); + } + } + } + + spin_unlock_bh(&hsr->list_lock); + + /* Restart timer */ + mod_timer(&hsr->prune_proxy_timer, + jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); +} + void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index b23556251d..7619e31c1d 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -46,6 +46,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr); void hsr_prune_nodes(struct timer_list *t); +void hsr_prune_proxy_nodes(struct timer_list *t); int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], @@ -67,6 +68,9 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node); void prp_update_san_info(struct hsr_node *node, bool is_sup); +bool hsr_is_node_in_db(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]); + struct hsr_node { struct list_head mac_list; /* Protect R/W access to seq_out */ diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 9756e657ba..d7ae32473c 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -96,7 +96,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, break; /* Handled in ndo_change_mtu() */ mtu_max = hsr_get_max_mtu(port->hsr); master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); - master->dev->mtu = mtu_max; + WRITE_ONCE(master->dev->mtu, mtu_max); break; case NETDEV_UNREGISTER: if (!is_hsr_master(dev)) { diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 18e01791ad..23850b16d1 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -21,6 +21,7 @@ */ #define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */ #define HSR_NODE_FORGET_TIME 60000 /* ms */ +#define HSR_PROXY_NODE_FORGET_TIME 60000 /* ms */ #define HSR_ANNOUNCE_INTERVAL 100 /* ms */ #define HSR_ENTRY_FORGET_TIME 400 /* ms */ @@ -35,6 +36,7 @@ * HSR_NODE_FORGET_TIME? */ #define PRUNE_PERIOD 3000 /* ms */ +#define PRUNE_PROXY_PERIOD 3000 /* ms */ #define HSR_TLV_EOT 0 /* End of TLVs */ #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23 @@ -192,11 +194,14 @@ struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; struct list_head node_db; /* Known HSR nodes */ + struct list_head proxy_node_db; /* RedBox HSR proxy nodes */ struct hsr_self_node __rcu *self_node; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ struct timer_list prune_timer; + struct timer_list prune_proxy_timer; int announce_count; u16 sequence_nr; + u16 interlink_sequence_nr; /* Interlink port seq_nr */ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ @@ -209,6 +214,8 @@ struct hsr_priv { * of lan_id */ bool fwd_offloaded; /* Forwarding offloaded to HW */ + bool redbox; /* Device supports HSR RedBox */ + unsigned char macaddress_redbox[ETH_ALEN]; unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16)); /* Align to u16 boundary to avoid unaligned access * in ether_addr_equal diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 78fe40eb9f..898f18c6da 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -23,6 +23,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 }, + [IFLA_HSR_INTERLINK] = { .type = NLA_U32 }, }; /* Here, it seems a netdevice has already been allocated for us, and the @@ -35,8 +36,8 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; - struct net_device *link[2]; + struct net_device *link[2], *interlink = NULL; if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); return -EINVAL; @@ -67,6 +68,20 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; } + if (data[IFLA_HSR_INTERLINK]) + interlink = __dev_get_by_index(src_net, + nla_get_u32(data[IFLA_HSR_INTERLINK])); + + if (interlink && interlink == link[0]) { + NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave1 are the same"); + return -EINVAL; + } + + if (interlink && interlink == link[1]) { + NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave2 are the same"); + return -EINVAL; + } + if (!data[IFLA_HSR_MULTICAST_SPEC]) multicast_spec = 0; else @@ -96,10 +111,17 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, } } - if (proto == HSR_PROTOCOL_PRP) + if (proto == HSR_PROTOCOL_PRP) { proto_version = PRP_V1; + if (interlink) { + NL_SET_ERR_MSG_MOD(extack, + "Interlink only works with HSR"); + return -EINVAL; + } + } - return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack); + return hsr_dev_finalize(dev, link, interlink, multicast_spec, + proto_version, extack); } static void hsr_dellink(struct net_device *dev, struct list_head *head) @@ -107,6 +129,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) struct hsr_priv *hsr = netdev_priv(dev); del_timer_sync(&hsr->prune_timer); + del_timer_sync(&hsr->prune_proxy_timer); del_timer_sync(&hsr->announce_timer); hsr_debugfs_term(hsr); @@ -114,6 +137,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) hsr_del_self_node(hsr); hsr_del_nodes(&hsr->node_db); + hsr_del_nodes(&hsr->proxy_node_db); unregister_netdevice_queue(dev, head); } diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 1b6457f357..af6cf64a00 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -55,6 +55,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 2c087b7f17..77b4e92027 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -93,7 +93,7 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) static int lowpan_get_iflink(const struct net_device *dev) { - return lowpan_802154_dev(dev)->wdev->ifindex; + return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex); } static const struct net_device_ops lowpan_netdev_ops = { @@ -280,5 +280,6 @@ static void __exit lowpan_cleanup_module(void) module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); +MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan"); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6dd960ec55..56ef873828 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -338,7 +338,6 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -351,7 +350,6 @@ static struct ctl_table lowpan_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) @@ -370,10 +368,8 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; @@ -399,7 +395,7 @@ err_alloc: static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 00302e8b96..990a83455d 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1137,4 +1137,5 @@ module_init(af_ieee802154_init); module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IEEE 802.15.4 socket interface"); MODULE_ALIAS_NETPROTO(PF_IEEE802154); diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c index d290393380..6708160ebf 100644 --- a/net/ieee802154/sysfs.c +++ b/net/ieee802154/sysfs.c @@ -93,7 +93,7 @@ static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume); #define WPAN_PHY_PM_OPS NULL #endif -struct class wpan_phy_class = { +const struct class wpan_phy_class = { .name = "ieee802154", .dev_release = wpan_phy_release, .dev_groups = pmib_groups, diff --git a/net/ieee802154/sysfs.h b/net/ieee802154/sysfs.h index 337545b639..69961e1662 100644 --- a/net/ieee802154/sysfs.h +++ b/net/ieee802154/sysfs.h @@ -5,6 +5,6 @@ int wpan_phy_sysfs_init(void); void wpan_phy_sysfs_exit(void); -extern struct class wpan_phy_class; +extern const struct class wpan_phy_class; #endif /* __IEEE802154_SYSFS_H */ diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 62aa646525..591ce0a16f 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -75,7 +75,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf, ), TP_fast_assign( WPAN_PHY_ASSIGN; - __assign_str(vir_intf_name, name ? name : "<noname>"); + __assign_str(vir_intf_name); __entry->type = type; __entry->extended_addr = extended_addr; ), diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a5a820ee20..b24d746166 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #endif #include <net/l3mdev.h> #include <net/compat.h> +#include <net/rps.h> #include <trace/events/sock.h> @@ -757,7 +758,9 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | - TCPF_CLOSE_WAIT | TCPF_CLOSE))); + TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | + TCPF_CLOSING | TCPF_CLOSE_WAIT | + TCPF_CLOSE))); if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) set_bit(SOCK_SUPPORT_ZC, &newsock->flags); @@ -770,16 +773,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new * Accept a pending connection. The TCP layer now gives BSD semantics. */ -int inet_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +int inet_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk1 = sock->sk, *sk2; - int err = -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ - sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); + arg->err = -EINVAL; + sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg); if (!sk2) - return err; + return arg->err; lock_sock(sk2); __inet_accept(sock, newsock, sk2); @@ -1071,6 +1074,7 @@ const struct proto_ops inet_stream_ops = { #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, + .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, @@ -1103,7 +1107,7 @@ const struct proto_ops inet_dgram_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, - .set_peek_off = sk_set_peek_off, + .set_peek_off = udp_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif @@ -1305,8 +1309,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) int inet_sk_rebuild_header(struct sock *sk) { + struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ -1326,7 +1330,7 @@ int inet_sk_rebuild_header(struct sock *sk) fl4 = &inet->cork.fl.u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, - sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (!IS_ERR(rt)) { err = 0; @@ -1480,7 +1484,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) struct sk_buff *p; unsigned int hlen; unsigned int off; - unsigned int id; int flush = 1; int proto; @@ -1506,13 +1509,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) goto out; NAPI_GRO_CB(skb)->proto = proto; - id = ntohl(*(__be32 *)&iph->id); - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); - id >>= 16; + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF)); list_for_each_entry(p, head, list) { struct iphdr *iph2; - u16 flush_id; if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -1529,48 +1529,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) NAPI_GRO_CB(p)->same_flow = 0; continue; } - - /* All fields must match except length and checksum. */ - NAPI_GRO_CB(p)->flush |= - (iph->ttl ^ iph2->ttl) | - (iph->tos ^ iph2->tos) | - ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); - - NAPI_GRO_CB(p)->flush |= flush; - - /* We need to store of the IP ID check to be included later - * when we can verify that this packet does in fact belong - * to a given flow. - */ - flush_id = (u16)(id - ntohs(iph2->id)); - - /* This bit of code makes it much easier for us to identify - * the cases where we are doing atomic vs non-atomic IP ID - * checks. Specifically an atomic check can return IP ID - * values 0 - 0xFFFF, while a non-atomic check can only - * return 0 or 0xFFFF. - */ - if (!NAPI_GRO_CB(p)->is_atomic || - !(iph->frag_off & htons(IP_DF))) { - flush_id ^= NAPI_GRO_CB(p)->count; - flush_id = flush_id ? 0xFFFF : 0; - } - - /* If the previous IP ID value was based on an atomic - * datagram we can overwrite the value and ignore it. - */ - if (NAPI_GRO_CB(skb)->is_atomic) - NAPI_GRO_CB(p)->flush_id = flush_id; - else - NAPI_GRO_CB(p)->flush_id |= flush_id; } - NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; - skb_set_network_header(skb, off); - /* The above will be needed by the transport layer if there is one - * immediately following this IP hdr. - */ + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 @@ -1751,19 +1713,6 @@ static const struct net_protocol igmp_protocol = { }; #endif -static const struct net_protocol tcp_protocol = { - .handler = tcp_v4_rcv, - .err_handler = tcp_v4_err, - .no_policy = 1, - .icmp_strict_tag_validation = 1, -}; - -static const struct net_protocol udp_protocol = { - .handler = udp_rcv, - .err_handler = udp_err, - .no_policy = 1, -}; - static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, @@ -1904,14 +1853,6 @@ static int ipv4_proc_init(void); * IP protocol layer initialiser */ -static struct packet_offload ip_packet_offload __read_mostly = { - .type = cpu_to_be16(ETH_P_IP), - .callbacks = { - .gso_segment = inet_gso_segment, - .gro_receive = inet_gro_receive, - .gro_complete = inet_gro_complete, - }, -}; static const struct net_offload ipip_offload = { .callbacks = { @@ -1938,7 +1879,15 @@ static int __init ipv4_offload_init(void) if (ipip_offload_init() < 0) pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); - dev_add_offload(&ip_packet_offload); + net_hotdata.ip_packet_offload = (struct packet_offload) { + .type = cpu_to_be16(ETH_P_IP), + .callbacks = { + .gso_segment = inet_gso_segment, + .gro_receive = inet_gro_receive, + .gro_complete = inet_gro_complete, + }, + }; + dev_add_offload(&net_hotdata.ip_packet_offload); return 0; } @@ -1992,9 +1941,22 @@ static int __init inet_init(void) if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); - if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) + + net_hotdata.udp_protocol = (struct net_protocol) { + .handler = udp_rcv, + .err_handler = udp_err, + .no_policy = 1, + }; + if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); - if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) + + net_hotdata.tcp_protocol = (struct net_protocol) { + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .no_policy = 1, + .icmp_strict_tag_validation = 1, + }; + if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0d0d725b46..11c1519b36 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -456,7 +456,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) /*unsigned long now; */ struct net *net = dev_net(dev); - rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev)); + rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { @@ -1002,6 +1003,55 @@ out_of_mem: * User level interface (ioctl) */ +static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, + bool getarp) +{ + struct net_device *dev; + + if (getarp) + dev = dev_get_by_name_rcu(net, r->arp_dev); + else + dev = __dev_get_by_name(net, r->arp_dev); + if (!dev) + return ERR_PTR(-ENODEV); + + /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ + if (!r->arp_ha.sa_family) + r->arp_ha.sa_family = dev->type; + + if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) + return ERR_PTR(-EINVAL); + + return dev; +} + +static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) +{ + struct net_device *dev; + struct rtable *rt; + __be32 ip; + + if (r->arp_dev[0]) + return arp_req_dev_by_name(net, r, false); + + if (r->arp_flags & ATF_PUBL) + return NULL; + + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + + rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); + if (IS_ERR(rt)) + return ERR_CAST(rt); + + dev = rt->dst.dev; + ip_rt_put(rt); + + if (!dev) + return ERR_PTR(-EINVAL); + + return dev; +} + /* * Set (create) an ARP cache entry. */ @@ -1022,11 +1072,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { - __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask && mask != htonl(0xFFFFFFFF)) - return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); @@ -1034,6 +1081,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -ENODEV; } if (mask) { + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; @@ -1042,29 +1091,20 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return arp_req_set_proxy(net, dev, 1); } -static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device *dev) +static int arp_req_set(struct net *net, struct arpreq *r) { - __be32 ip; struct neighbour *neigh; + struct net_device *dev; + __be32 ip; int err; + dev = arp_req_dev(net, r); + if (IS_ERR(dev)) + return PTR_ERR(dev); + if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); - ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (r->arp_flags & ATF_PERM) - r->arp_flags |= ATF_COM; - if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); - - if (IS_ERR(rt)) - return PTR_ERR(rt); - dev = rt->dst.dev; - ip_rt_put(rt); - if (!dev) - return -EINVAL; - } switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: @@ -1086,12 +1126,18 @@ static int arp_req_set(struct net *net, struct arpreq *r, break; } + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; - if (r->arp_flags & ATF_PERM) + + if (r->arp_flags & ATF_PERM) { + r->arp_flags |= ATF_COM; state = NUD_PERMANENT; + } + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | @@ -1115,27 +1161,40 @@ static unsigned int arp_state_to_flags(struct neighbour *neigh) * Get an ARP cache entry. */ -static int arp_req_get(struct arpreq *r, struct net_device *dev) +static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; - int err = -ENXIO; + struct net_device *dev; + + if (!r->arp_dev[0]) + return -ENODEV; + + dev = arp_req_dev_by_name(net, r, true); + if (IS_ERR(dev)) + return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); - if (neigh) { - if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) { - read_lock_bh(&neigh->lock); - memcpy(r->arp_ha.sa_data, neigh->ha, - min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); - r->arp_flags = arp_state_to_flags(neigh); - read_unlock_bh(&neigh->lock); - r->arp_ha.sa_family = dev->type; - strscpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); - err = 0; - } + if (!neigh) + return -ENXIO; + + if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); + return -ENXIO; } - return err; + + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, + min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + + neigh_release(neigh); + + r->arp_ha.sa_family = dev->type; + netdev_copy_name(dev, r->arp_dev); + + return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) @@ -1166,36 +1225,31 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { - __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask == htonl(0xFFFFFFFF)) - return pneigh_delete(&arp_tbl, net, &ip, dev); + if (mask) { + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (mask) - return -EINVAL; + return pneigh_delete(&arp_tbl, net, &ip, dev); + } return arp_req_set_proxy(net, dev, 0); } -static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device *dev) +static int arp_req_delete(struct net *net, struct arpreq *r) { + struct net_device *dev; __be32 ip; + dev = arp_req_dev(net, r); + if (IS_ERR(dev)) + return PTR_ERR(dev); + if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); - if (IS_ERR(rt)) - return PTR_ERR(rt); - dev = rt->dst.dev; - ip_rt_put(rt); - if (!dev) - return -EINVAL; - } + return arp_invalidate(dev, ip, true); } @@ -1205,9 +1259,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r, int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { - int err; struct arpreq r; - struct net_device *dev = NULL; + __be32 *netmask; + int err; switch (cmd) { case SIOCDARP: @@ -1230,42 +1284,34 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; + + netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) - ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = - htonl(0xFFFFFFFFUL); - rtnl_lock(); - if (r.arp_dev[0]) { - err = -ENODEV; - dev = __dev_get_by_name(net, r.arp_dev); - if (!dev) - goto out; - - /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ - if (!r.arp_ha.sa_family) - r.arp_ha.sa_family = dev->type; - err = -EINVAL; - if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) - goto out; - } else if (cmd == SIOCGARP) { - err = -ENODEV; - goto out; - } + *netmask = htonl(0xFFFFFFFFUL); + else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) + return -EINVAL; switch (cmd) { case SIOCDARP: - err = arp_req_delete(net, &r, dev); + rtnl_lock(); + err = arp_req_delete(net, &r); + rtnl_unlock(); break; case SIOCSARP: - err = arp_req_set(net, &r, dev); + rtnl_lock(); + err = arp_req_set(net, &r); + rtnl_unlock(); break; case SIOCGARP: - err = arp_req_get(&r, dev); + rcu_read_lock(); + err = arp_req_get(net, &r); + rcu_read_unlock(); + + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; break; } -out: - rtnl_unlock(); - if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) - err = -EFAULT; + return err; } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index ae8b15e689..18227757ec 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -12,7 +12,7 @@ #include <net/bpf_sk_storage.h> /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; +static struct bpf_struct_ops bpf_tcp_congestion_ops; static u32 unsupported_ops[] = { offsetof(struct tcp_congestion_ops, get_info), @@ -20,6 +20,7 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; +static const struct btf_type *tcp_congestion_ops_type; static int bpf_tcp_ca_init(struct btf *btf) { @@ -36,6 +37,11 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_congestion_ops_type = btf_type_by_id(btf, type_id); + return 0; } @@ -101,6 +107,9 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, case offsetof(struct tcp_sock, snd_cwnd_cnt): end = offsetofend(struct tcp_sock, snd_cwnd_cnt); break; + case offsetof(struct tcp_sock, snd_cwnd_stamp): + end = offsetofend(struct tcp_sock, snd_cwnd_stamp); + break; case offsetof(struct tcp_sock, snd_ssthresh): end = offsetofend(struct tcp_sock, snd_ssthresh); break; @@ -149,7 +158,7 @@ static u32 prog_ops_moff(const struct bpf_prog *prog) u32 midx; midx = prog->expected_attach_type; - t = bpf_tcp_congestion_ops.type; + t = tcp_congestion_ops_type; m = &btf_type_member(t)[midx]; return __btf_member_bit_offset(t, m) / 8; @@ -191,17 +200,17 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } -BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids) +BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids) BTF_ID_FLAGS(func, tcp_reno_ssthresh) BTF_ID_FLAGS(func, tcp_reno_cong_avoid) BTF_ID_FLAGS(func, tcp_reno_undo_cwnd) BTF_ID_FLAGS(func, tcp_slow_start) BTF_ID_FLAGS(func, tcp_cong_avoid_ai) -BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids) +BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids) static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = { .owner = THIS_MODULE, @@ -301,7 +310,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) return 0; } -static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) { } @@ -339,7 +349,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { .release = __bpf_tcp_ca_release, }; -struct bpf_struct_ops bpf_tcp_congestion_ops = { +static struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, .unreg = bpf_tcp_ca_unreg, @@ -350,10 +360,16 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .validate = bpf_tcp_ca_validate, .name = "tcp_congestion_ops", .cfi_stubs = &__bpf_ops_tcp_congestion_ops, + .owner = THIS_MODULE, }; static int __init bpf_tcp_ca_kfunc_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set); + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set); + ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops, tcp_congestion_ops); + + return ret; } late_initcall(bpf_tcp_ca_kfunc_init); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index d048aa8332..e9cb27061c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -864,11 +864,8 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, net_clen_bits, net_spot + 1, 1); - if (net_spot < 0) { - if (net_spot == -2) - return -EFAULT; + if (net_spot < 0) return 0; - } switch (doi_def->type) { case CIPSO_V4_MAP_PASS: @@ -1813,11 +1810,35 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, return CIPSO_V4_HDR_LEN + ret_val; } +static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len) +{ + int iter = 0, optlen = 0; + + /* determining the new total option length is tricky because of + * the padding necessary, the only thing i can think to do at + * this point is walk the options one-by-one, skipping the + * padding at the end to determine the actual option size and + * from there we can determine the new total option length + */ + while (iter < len) { + if (data[iter] == IPOPT_END) { + break; + } else if (data[iter] == IPOPT_NOP) { + iter++; + } else { + iter += data[iter + 1]; + optlen = iter; + } + } + return optlen; +} + /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket + * @sk_locked: true if caller holds the socket lock * * Description: * Set the CIPSO option on the given socket using the DOI definition and @@ -1829,7 +1850,8 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, */ int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { int ret_val = -EPERM; unsigned char *buf = NULL; @@ -1879,8 +1901,7 @@ int cipso_v4_sock_setattr(struct sock *sk, sk_inet = inet_sk(sk); - old = rcu_dereference_protected(sk_inet->inet_opt, - lockdep_sock_is_held(sk)); + old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked); if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) @@ -1988,7 +2009,6 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; - int iter; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); @@ -2008,19 +2028,8 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); - /* determining the new total option length is tricky because of - * the padding necessary, the only thing i can think to do at - * this point is walk the options one-by-one, skipping the - * padding at the end to determine the actual option size and - * from there we can determine the new total option length */ - iter = 0; - optlen_new = 0; - while (iter < opt->opt.optlen) - if (opt->opt.__data[iter] != IPOPT_NOP) { - iter += opt->opt.__data[iter + 1]; - optlen_new = iter; - } else - iter++; + optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data, + opt->opt.optlen); hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; @@ -2240,7 +2249,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { - int ret_val; + int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len, + hdr_len_delta; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; @@ -2253,16 +2263,37 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb) if (ret_val < 0) return ret_val; - /* the easiest thing to do is just replace the cipso option with noop - * options since we don't change the size of the packet, although we - * still need to recalculate the checksum */ - iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; - memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); + cipso_len = cipso_ptr[1]; + + hdr_len_actual = sizeof(struct iphdr) + + cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1), + opt->optlen); + new_hdr_len_actual = hdr_len_actual - cipso_len; + new_hdr_len = (new_hdr_len_actual + 3) & ~3; + hdr_len_delta = (iph->ihl << 2) - new_hdr_len; + + /* 1. shift any options after CIPSO to the left */ + memmove(cipso_ptr, cipso_ptr + cipso_len, + new_hdr_len_actual - opt->cipso); + /* 2. move the whole IP header to its new place */ + memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual); + /* 3. adjust the skb layout */ + skb_pull(skb, hdr_len_delta); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + /* 4. re-fill new padding with IPOPT_END (may now be longer) */ + memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END, + new_hdr_len - new_hdr_len_actual); + + opt->optlen -= hdr_len_delta; opt->cipso = 0; opt->is_changed = 1; - + if (hdr_len_delta != 0) { + iph->ihl = new_hdr_len >> 2; + iph_set_totlen(iph, skb->len); + } ip_send_check(iph); return 0; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 2cc50cbfc2..cc6d0bd7b0 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -119,7 +119,7 @@ void ip4_datagram_release_cb(struct sock *sk) rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + ip_sock_rt_tos(sk), sk->sk_bound_dev_if); dst = !IS_ERR(rt) ? &rt->dst : NULL; sk_dst_set(sk, dst); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index bc74f131fe..d09f557eaa 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -224,6 +224,7 @@ static struct in_ifaddr *inet_alloc_ifa(void) static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); + if (ifa->ifa_dev) in_dev_put(ifa->ifa_dev); kfree(ifa); @@ -231,7 +232,11 @@ static void inet_rcu_free_ifa(struct rcu_head *head) static void inet_free_ifa(struct in_ifaddr *ifa) { - call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); + /* Our reference to ifa->ifa_dev must be freed ASAP + * to release the reference to the netdev the same way. + * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() + */ + call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) @@ -713,34 +718,37 @@ static void check_lifetime(struct work_struct *work) rcu_read_lock(); hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { - unsigned long age; + unsigned long age, tstamp; + u32 preferred_lft; + u32 valid_lft; + u32 flags; - if (ifa->ifa_flags & IFA_F_PERMANENT) + flags = READ_ONCE(ifa->ifa_flags); + if (flags & IFA_F_PERMANENT) continue; + preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); + valid_lft = READ_ONCE(ifa->ifa_valid_lft); + tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ - age = (now - ifa->ifa_tstamp + + age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; - if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && - age >= ifa->ifa_valid_lft) { + if (valid_lft != INFINITY_LIFE_TIME && + age >= valid_lft) { change_needed = true; - } else if (ifa->ifa_preferred_lft == + } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; - } else if (age >= ifa->ifa_preferred_lft) { - if (time_before(ifa->ifa_tstamp + - ifa->ifa_valid_lft * HZ, next)) - next = ifa->ifa_tstamp + - ifa->ifa_valid_lft * HZ; + } else if (age >= preferred_lft) { + if (time_before(tstamp + valid_lft * HZ, next)) + next = tstamp + valid_lft * HZ; - if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) + if (!(flags & IFA_F_DEPRECATED)) change_needed = true; - } else if (time_before(ifa->ifa_tstamp + - ifa->ifa_preferred_lft * HZ, + } else if (time_before(tstamp + preferred_lft * HZ, next)) { - next = ifa->ifa_tstamp + - ifa->ifa_preferred_lft * HZ; + next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); @@ -804,24 +812,26 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; + u32 flags; - ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); + flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) - ifa->ifa_valid_lft = timeout; + WRITE_ONCE(ifa->ifa_valid_lft, timeout); else - ifa->ifa_flags |= IFA_F_PERMANENT; + flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) - ifa->ifa_flags |= IFA_F_DEPRECATED; - ifa->ifa_preferred_lft = timeout; + flags |= IFA_F_DEPRECATED; + WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } - ifa->ifa_tstamp = jiffies; + WRITE_ONCE(ifa->ifa_flags, flags); + WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) - ifa->ifa_cstamp = ifa->ifa_tstamp; + WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, @@ -1312,7 +1322,7 @@ static __be32 in_dev_select_addr(const struct in_device *in_dev, const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { - if (ifa->ifa_flags & IFA_F_SECONDARY) + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) @@ -1340,7 +1350,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { - if (ifa->ifa_flags & IFA_F_SECONDARY) + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; @@ -1671,12 +1681,14 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } -static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, +static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; + unsigned long tstamp; u32 preferred, valid; + u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); @@ -1686,7 +1698,13 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; - ifm->ifa_flags = ifa->ifa_flags; + + flags = READ_ONCE(ifa->ifa_flags); + /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. + * The 32bit value is given in IFA_FLAGS attribute. + */ + ifm->ifa_flags = (__u8)flags; + ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; @@ -1694,11 +1712,12 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; - if (!(ifm->ifa_flags & IFA_F_PERMANENT)) { - preferred = ifa->ifa_preferred_lft; - valid = ifa->ifa_valid_lft; + tstamp = READ_ONCE(ifa->ifa_tstamp); + if (!(flags & IFA_F_PERMANENT)) { + preferred = READ_ONCE(ifa->ifa_preferred_lft); + valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { - long tval = (jiffies - ifa->ifa_tstamp) / HZ; + long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; @@ -1725,10 +1744,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || - nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || + nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || - put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, + put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; @@ -1798,15 +1817,15 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, - struct netlink_callback *cb, int s_ip_idx, + struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; - in_dev_for_each_ifa_rtnl(ifa, in_dev) { - if (ip_idx < s_ip_idx) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ip_idx < *s_ip_idx) { ip_idx++; continue; } @@ -1818,9 +1837,9 @@ static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, ip_idx++; } err = 0; - + ip_idx = 0; done: - cb->args[2] = ip_idx; + *s_ip_idx = ip_idx; return err; } @@ -1830,7 +1849,7 @@ done: static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + - net->dev_base_seq; + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. @@ -1852,75 +1871,52 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; - int h, s_h; - int idx, s_idx; - int s_ip_idx; - struct net_device *dev; + struct { + unsigned long ifindex; + int ip_idx; + } *ctx = (void *)cb->ctx; struct in_device *in_dev; - struct hlist_head *head; + struct net_device *dev; int err = 0; - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - s_ip_idx = cb->args[2]; - + rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) - goto put_tgt_net; + goto done; - err = 0; if (fillargs.ifindex) { - dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; - goto put_tgt_net; - } - - in_dev = __in_dev_get_rtnl(dev); - if (in_dev) { - err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx, - &fillargs); + goto done; } - goto put_tgt_net; - } - } - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - rcu_read_lock(); - cb->seq = inet_base_seq(tgt_net); - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - if (h > s_h || idx > s_idx) - s_ip_idx = 0; in_dev = __in_dev_get_rcu(dev); if (!in_dev) - goto cont; - - err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx, - &fillargs); - if (err < 0) { - rcu_read_unlock(); goto done; - } -cont: - idx++; + err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, + &fillargs); + goto done; } - rcu_read_unlock(); } + cb->seq = inet_base_seq(tgt_net); + + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, + &fillargs); + if (err < 0) + goto done; + } done: - cb->args[0] = h; - cb->args[1] = idx; -put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - - return skb->len ? : err; + rcu_read_unlock(); + return err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, @@ -1982,7 +1978,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) - ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i]; + ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } @@ -2068,9 +2064,9 @@ static int inet_netconf_msgsize_devconf(int type) } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, - struct ipv4_devconf *devconf, u32 portid, - u32 seq, int event, unsigned int flags, - int type) + const struct ipv4_devconf *devconf, + u32 portid, u32 seq, int event, + unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; @@ -2095,27 +2091,28 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, - IPV4_DEVCONF(*devconf, FORWARDING)) < 0) + IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, - IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) + IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, - IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) + IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, - IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0) + IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, - IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) + IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, - IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) + IPV4_DEVCONF_RO(*devconf, + IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: @@ -2204,21 +2201,20 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); - struct nlattr *tb[NETCONFA_MAX+1]; + struct nlattr *tb[NETCONFA_MAX + 1]; + const struct ipv4_devconf *devconf; + struct in_device *in_dev = NULL; + struct net_device *dev = NULL; struct sk_buff *skb; - struct ipv4_devconf *devconf; - struct in_device *in_dev; - struct net_device *dev; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) - goto errout; + return err; - err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) - goto errout; + return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { @@ -2229,10 +2225,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, devconf = net->ipv4.devconf_dflt; break; default: - dev = __dev_get_by_index(net, ifindex); - if (!dev) - goto errout; - in_dev = __in_dev_get_rtnl(dev); + err = -ENODEV; + dev = dev_get_by_index(net, ifindex); + if (dev) + in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; @@ -2256,6 +2252,9 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: + if (in_dev) + in_dev_put(in_dev); + dev_put(dev); return err; } @@ -2264,11 +2263,13 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - int h, s_h; - int idx, s_idx; + struct { + unsigned long ifindex; + unsigned int all_default; + } *ctx = (void *)cb->ctx; + const struct in_device *in_dev; struct net_device *dev; - struct in_device *in_dev; - struct hlist_head *head; + int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; @@ -2285,64 +2286,45 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, } } - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - rcu_read_lock(); - cb->seq = inet_base_seq(net); - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - in_dev = __in_dev_get_rcu(dev); - if (!in_dev) - goto cont; - - if (inet_netconf_fill_devconf(skb, dev->ifindex, - &in_dev->cnf, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, - NLM_F_MULTI, - NETCONFA_ALL) < 0) { - rcu_read_unlock(); - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - rcu_read_unlock(); + rcu_read_lock(); + for_each_netdev_dump(net, dev, ctx->ifindex) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + err = inet_netconf_fill_devconf(skb, dev->ifindex, + &in_dev->cnf, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) + goto done; } - if (h == NETDEV_HASHENTRIES) { - if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, - net->ipv4.devconf_all, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, NLM_F_MULTI, - NETCONFA_ALL) < 0) + if (ctx->all_default == 0) { + err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv4.devconf_all, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) goto done; - else - h++; - } - if (h == NETDEV_HASHENTRIES + 1) { - if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, - net->ipv4.devconf_dflt, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, NLM_F_MULTI, - NETCONFA_ALL) < 0) + ctx->all_default++; + } + if (ctx->all_default == 1) { + err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv4.devconf_dflt, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) goto done; - else - h++; + ctx->all_default++; } done: - cb->args[0] = h; - cb->args[1] = idx; - - return skb->len; + rcu_read_unlock(); + return err; } #ifdef CONFIG_SYSCTL @@ -2546,7 +2528,7 @@ static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; + struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -2609,7 +2591,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, if (!t) goto out; - for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; @@ -2683,7 +2665,6 @@ static struct ctl_table ctl_forward_entry[] = { .extra1 = &ipv4_devconf, .extra2 = &init_net, }, - { }, }; #endif @@ -2780,7 +2761,7 @@ err_alloc_all: static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); @@ -2823,7 +2804,9 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, + RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, - inet_netconf_dump_devconf, 0); + inet_netconf_dump_devconf, + RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d33d124218..619a4df7be 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -20,6 +20,7 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/espintcp.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -114,7 +115,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET_ESPINTCP @@ -238,8 +239,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) #else static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) { - kfree_skb(skb); - + WARN_ON(1); return -EOPNOTSUPP; } #endif @@ -347,7 +347,6 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, __be16 dport) { struct udphdr *uh; - __be32 *udpdata32; unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); @@ -362,12 +361,6 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, *skb_mac_header(skb) = IPPROTO_UDP; - if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - return (struct ip_esp_hdr *)(udpdata32 + 2); - } - return (struct ip_esp_hdr *)(uh + 1); } @@ -423,7 +416,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); break; case TCP_ENCAP_ESPINTCP: @@ -775,7 +767,6 @@ int esp_input_done2(struct sk_buff *skb, int err) source = th->source; break; case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; break; default: @@ -1179,9 +1170,6 @@ static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); - break; #ifdef CONFIG_INET_ESPINTCP case TCP_ENCAP_ESPINTCP: /* only the length field, TCP encap is done by diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index b3271957ad..3f28ecbdca 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -56,6 +56,13 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); + + if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { + /* non-offload path will record the error and audit log */ + xfrm_state_put(x); + x = NULL; + } + if (!x) goto out_reset; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 390f4be7f7..7ad2cafb92 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -916,7 +916,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct rtmsg *rtm; int err, i; - ASSERT_RTNL(); + if (filter->rtnl_held) + ASSERT_RTNL(); if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); @@ -961,7 +962,10 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); - filter->dev = __dev_get_by_index(net, ifindex); + if (filter->rtnl_held) + filter->dev = __dev_get_by_index(net, ifindex); + else + filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; @@ -983,20 +987,24 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { - struct fib_dump_filter filter = { .dump_routes = true, - .dump_exceptions = true }; + struct fib_dump_filter filter = { + .dump_routes = true, + .dump_exceptions = true, + .rtnl_held = false, + }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; - int dumped = 0, err; + int dumped = 0, err = 0; + rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) - return err; + goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); @@ -1005,29 +1013,26 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) - return skb->len; + goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) - return skb->len; + goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } - - rcu_read_lock(); err = fib_table_dump(tb, skb, cb, &filter); - rcu_read_unlock(); - return skb->len ? : err; + goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; - rcu_read_lock(); - + err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; @@ -1038,25 +1043,20 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } + if (err < 0) + goto out; dumped = 1; next: e++; } } out: - err = skb->len; -out_err: - rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; +unlock: + rcu_read_unlock(); return err; } @@ -1659,5 +1659,6 @@ void __init ip_fib_init(void) rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, + RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5eb1b8d302..8956026bc0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -254,7 +254,7 @@ void free_fib_info(struct fib_info *fi) return; } - call_rcu(&fi->rcu, free_fib_info_rcu); + call_rcu_hurry(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); @@ -2270,6 +2270,15 @@ void fib_select_path(struct net *net, struct fib_result *res, fib_select_default(fl4, res); check_saddr: - if (!fl4->saddr) - fl4->saddr = fib_result_prefsrc(net, res); + if (!fl4->saddr) { + struct net_device *l3mdev; + + l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev); + + if (!l3mdev || + l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev) + fl4->saddr = fib_result_prefsrc(net, res); + else + fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); + } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3ff35f8117..8f30e3f00b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -501,7 +501,7 @@ static void tnode_free(struct key_vector *tn) if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; - synchronize_rcu(); + synchronize_net(); } } @@ -1629,6 +1629,7 @@ set_result: res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; + res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; @@ -2368,7 +2369,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, * and key == 0 means the dump has wrapped around and we are done. */ if (count && !key) - return skb->len; + return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; @@ -2394,7 +2395,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, cb->args[3] = key; cb->args[2] = count; - return skb->len; + return 0; } void __init fib_trie_init(void) diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c index 4da03bf45c..54984f3170 100644 --- a/net/ipv4/fou_bpf.c +++ b/net/ipv4/fou_bpf.c @@ -64,7 +64,7 @@ __bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, info->encap.type = TUNNEL_ENCAP_NONE; } - if (info->key.tun_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM; info->encap.sport = encap->sport; @@ -100,10 +100,10 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, __bpf_kfunc_end_defs(); -BTF_SET8_START(fou_kfunc_set) +BTF_KFUNCS_START(fou_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_set_fou_encap) BTF_ID_FLAGS(func, bpf_skb_get_fou_encap) -BTF_SET8_END(fou_kfunc_set) +BTF_KFUNCS_END(fou_kfunc_set) static const struct btf_kfunc_id_set fou_bpf_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 0c41076e31..0abbc413e0 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -351,7 +351,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, optlen = guehdr->hlen << 2; len += optlen; - if (skb_gro_header_hard(skb, len)) { + if (!skb_gro_may_pull(skb, len)) { guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; @@ -433,7 +433,7 @@ next_proto: offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); - if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) + if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index cbb2b4bb0d..6701a98d9a 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -73,7 +73,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; - tpi->flags = gre_flags_to_tnl_flags(greh->flags); + gre_flags_to_tnl_flags(tpi->flags, greh->flags); hdr_len = gre_calc_hlen(tpi->flags); if (!pskb_may_pull(skb, nhs + hdr_len)) @@ -217,5 +217,5 @@ module_init(gre_init); module_exit(gre_exit); MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 311e70bfce..5028c72d49 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head, grehlen += GRE_HEADER_SECTION; hlen = off + grehlen; - if (skb_gro_header_hard(skb, hlen)) { + if (!skb_gro_may_pull(skb, hlen)) { greh = skb_gro_header_slow(skb, hlen, off); if (unlikely(!greh)) goto out; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 437e782b96..ab6d0d98db 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -93,6 +93,8 @@ #include <net/ip_fib.h> #include <net/l3mdev.h> #include <net/addrconf.h> +#define CREATE_TRACE_POINTS +#include <trace/events/icmp.h> /* * Build xmit assembly blocks @@ -483,6 +485,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct icmp_bxm *param) { struct net_device *route_lookup_dev; + struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -508,16 +511,17 @@ static struct rtable *icmp_route_lookup(struct net *net, /* No need to clone since we're just using its address. */ rt2 = rt; - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(fl4), NULL, 0); - if (!IS_ERR(rt)) { + dst = xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + rt = dst_rtable(dst); + if (!IS_ERR(dst)) { if (rt != rt2) return rt; - } else if (PTR_ERR(rt) == -EPERM) { + } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; - } else + } else { return rt; - + } err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; @@ -551,19 +555,19 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(&fl4_dec), NULL, - XFRM_LOOKUP_ICMP); - if (!IS_ERR(rt2)) { + dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + rt2 = dst_rtable(dst2); + if (!IS_ERR(dst2)) { dst_release(&rt->dst); memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; - } else if (PTR_ERR(rt2) == -EPERM) { + } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { - err = PTR_ERR(rt2); + err = PTR_ERR(dst2); goto relookup_failed; } return rt; @@ -768,6 +772,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (!fl4.saddr) fl4.saddr = htonl(INADDR_DUMMY); + trace_icmp_send(skb_in, type, code); + icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index efeeca2b13..9bf09de6a2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -120,12 +120,12 @@ */ #define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ + (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ + (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) @@ -1842,7 +1842,8 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, - 0, 0, 0); + 0, 0, 0, + RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d1492c649a..d4f0eff8b2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -661,7 +661,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) /* * This will accept the next outstanding connection. */ -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) +struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; @@ -680,7 +680,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) /* Find already established connection */ if (reqsk_queue_empty(queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; @@ -692,6 +692,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) goto out_err; } req = reqsk_queue_remove(queue, sk); + arg->is_empty = reqsk_queue_empty(queue); newsk = req->sk; if (sk->sk_protocol == IPPROTO_TCP && @@ -745,7 +746,7 @@ out: out_err: newsk = NULL; req = NULL; - *err = error; + arg->err = error; goto out; } EXPORT_SYMBOL(inet_csk_accept); @@ -930,8 +931,9 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req, memcpy(nreq_sk, req_sk, offsetof(struct sock, sk_dontcopy_begin)); - memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, - req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); + unsafe_memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, + req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end), + /* alloc is larger than struct, see above */); sk_node_init(&nreq_sk->sk_node); nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; @@ -1120,25 +1122,34 @@ drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } -static void reqsk_queue_hash_req(struct request_sock *req, +static bool reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { + bool found_dup_sk = false; + + if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk)) + return false; + + /* The timer needs to be setup after a successful insertion. */ timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); + return true; } -void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { - reqsk_queue_hash_req(req, timeout); + if (!reqsk_queue_hash_req(req, timeout)) + return false; + inet_csk_reqsk_queue_added(sk); + return true; } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); @@ -1491,7 +1502,7 @@ static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *f rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) rt = NULL; if (rt) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9804e9608a..9712cdb808 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -32,7 +32,7 @@ #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct inet_diag_handler **inet_diag_table; +static const struct inet_diag_handler __rcu **inet_diag_table; struct inet_diag_entry { const __be32 *saddr; @@ -48,28 +48,28 @@ struct inet_diag_entry { #endif }; -static DEFINE_MUTEX(inet_diag_table_mutex); - static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { - if (proto < 0 || proto >= IPPROTO_MAX) { - mutex_lock(&inet_diag_table_mutex); - return ERR_PTR(-ENOENT); - } + const struct inet_diag_handler *handler; + + if (proto < 0 || proto >= IPPROTO_MAX) + return NULL; if (!READ_ONCE(inet_diag_table[proto])) sock_load_diag_module(AF_INET, proto); - mutex_lock(&inet_diag_table_mutex); - if (!inet_diag_table[proto]) - return ERR_PTR(-ENOENT); + rcu_read_lock(); + handler = rcu_dereference(inet_diag_table[proto]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); - return inet_diag_table[proto]; + return handler; } static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { - mutex_unlock(&inet_diag_table_mutex); + module_put(handler->owner); } void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) @@ -104,9 +104,12 @@ static size_t inet_sk_attr_size(struct sock *sk, const struct inet_diag_handler *handler; size_t aux = 0; - handler = inet_diag_table[req->sdiag_protocol]; + rcu_read_lock(); + handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); + DEBUG_NET_WARN_ON_ONCE(!handler); if (handler && handler->idiag_get_aux_size) aux = handler->idiag_get_aux_size(sk, net_admin); + rcu_read_unlock(); return nla_total_size(sizeof(struct tcp_info)) + nla_total_size(sizeof(struct inet_diag_msg)) @@ -244,10 +247,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + int protocol; cb_data = cb->data; - handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; - BUG_ON(!handler); + protocol = inet_diag_get_protocol(req, cb_data); + + /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */ + handler = rcu_dereference_protected(inet_diag_table[protocol], 1); + DEBUG_NET_WARN_ON_ONCE(!handler); + if (!handler) + return -ENXIO; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -605,9 +614,10 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, protocol = inet_diag_get_protocol(req, &dump_data); handler = inet_diag_lock_handler(protocol); - if (IS_ERR(handler)) { - err = PTR_ERR(handler); - } else if (cmd == SOCK_DIAG_BY_FAMILY) { + if (!handler) + return -ENOENT; + + if (cmd == SOCK_DIAG_BY_FAMILY) { struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, @@ -1035,6 +1045,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, num = 0; ilb = &hashinfo->lhash2[i]; + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); @@ -1099,6 +1113,10 @@ resume_bind_walk: accum = 0; ibb = &hashinfo->bhash2[i]; + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } spin_lock_bh(&ibb->lock); inet_bind_bucket_for_each(tb2, &ibb->chain) { if (!net_eq(ib2_net(tb2), net)) @@ -1259,12 +1277,12 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, again: prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(protocol); - if (!IS_ERR(handler)) + if (handler) { handler->dump(skb, cb, r); - else - err = PTR_ERR(handler); - inet_diag_unlock_handler(handler); - + inet_diag_unlock_handler(handler); + } else { + err = -ENOENT; + } /* The skb is not large enough to fit one sk info and * inet_sk_diag_fill() has requested for a larger skb. */ @@ -1365,6 +1383,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; + req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; @@ -1380,6 +1399,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.sdiag_family = rc->idiag_family; req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; + req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; @@ -1457,10 +1477,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) } handler = inet_diag_lock_handler(sk->sk_protocol); - if (IS_ERR(handler)) { - inet_diag_unlock_handler(handler); + if (!handler) { nlmsg_cancel(skb, nlh); - return PTR_ERR(handler); + return -ENOENT; } attr = handler->idiag_info_size @@ -1479,6 +1498,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) } static const struct sock_diag_handler inet_diag_handler = { + .owner = THIS_MODULE, .family = AF_INET, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, @@ -1486,6 +1506,7 @@ static const struct sock_diag_handler inet_diag_handler = { }; static const struct sock_diag_handler inet6_diag_handler = { + .owner = THIS_MODULE, .family = AF_INET6, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, @@ -1495,20 +1516,12 @@ static const struct sock_diag_handler inet6_diag_handler = { int inet_diag_register(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; - int err = -EINVAL; if (type >= IPPROTO_MAX) - goto out; + return -EINVAL; - mutex_lock(&inet_diag_table_mutex); - err = -EEXIST; - if (!inet_diag_table[type]) { - WRITE_ONCE(inet_diag_table[type], h); - err = 0; - } - mutex_unlock(&inet_diag_table_mutex); -out: - return err; + return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type], + NULL, h) ? 0 : -EEXIST; } EXPORT_SYMBOL_GPL(inet_diag_register); @@ -1519,12 +1532,16 @@ void inet_diag_unregister(const struct inet_diag_handler *h) if (type >= IPPROTO_MAX) return; - mutex_lock(&inet_diag_table_mutex); - WRITE_ONCE(inet_diag_table[type], NULL); - mutex_unlock(&inet_diag_table_mutex); + xchg((const struct inet_diag_handler **)&inet_diag_table[type], + NULL); } EXPORT_SYMBOL_GPL(inet_diag_unregister); +static const struct sock_diag_inet_compat inet_diag_compat = { + .owner = THIS_MODULE, + .fn = inet_diag_rcv_msg_compat, +}; + static int __init inet_diag_init(void) { const int inet_diag_table_size = (IPPROTO_MAX * @@ -1543,7 +1560,7 @@ static int __init inet_diag_init(void) if (err) goto out_free_inet; - sock_diag_register_inet_compat(inet_diag_rcv_msg_compat); + sock_diag_register_inet_compat(&inet_diag_compat); out: return err; @@ -1558,7 +1575,7 @@ static void __exit inet_diag_exit(void) { sock_diag_unregister(&inet6_diag_handler); sock_diag_unregister(&inet_diag_handler); - sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat); + sock_diag_unregister_inet_compat(&inet_diag_compat); kfree(inet_diag_table); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c88c9034d6..faaec92a46 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -175,7 +175,7 @@ static void fqdir_free_fn(struct work_struct *work) } } -static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); +static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); static void fqdir_work_fn(struct work_struct *work) { @@ -184,7 +184,7 @@ static void fqdir_work_fn(struct work_struct *work) rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) - queue_work(system_wq, &fqdir_free_work); + queue_delayed_work(system_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4e470f1848..48d0d49418 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,6 +24,7 @@ #include <net/inet6_hashtables.h> #endif #include <net/secure_seq.h> +#include <net/hotdata.h> #include <net/ip.h> #include <net/tcp.h> #include <net/sock_reuseport.h> @@ -32,8 +33,6 @@ u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { - static u32 inet_ehash_secret __read_mostly; - net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, @@ -566,7 +565,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) + if (sk->sk_protocol == IPPROTO_TCP && + tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e8de45d34d..e28075f000 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -264,14 +264,18 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo) { + struct inet_ehash_bucket *head = &hashinfo->ehash[0]; + unsigned int ehash_mask = hashinfo->ehash_mask; struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; - for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + for (slot = 0; slot <= ehash_mask; slot++, head++) { + if (hlist_nulls_empty(&head->chain)) + continue; + restart_rcu: cond_resched(); rcu_read_lock(); @@ -283,15 +287,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count)) + if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely(sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count))) { + if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e9fed83e9b..5bd7599634 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -81,10 +81,7 @@ void __init inet_initpeers(void) inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); - peer_cachep = kmem_cache_create("inet_peer_cache", - sizeof(struct inet_peer), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, - NULL); + peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } /* Called with rcu_read_lock() or base->lock held */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb947d1613..08e2c92e25 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -580,7 +580,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, - { } }; /* secret interval has been deprecated */ @@ -593,7 +592,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) @@ -632,7 +630,7 @@ err_alloc: static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1954a56fec..ba20547352 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -265,6 +265,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; + IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -272,12 +273,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int ver; int len; + ip_tunnel_flags_copy(flags, tpi->flags); + itn = net_generic(net, erspan_net_id); iph = ip_hdr(skb); if (is_erspan_type1(gre_hdr_len)) { ver = 0; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_NO_KEY, + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); } else { if (unlikely(!pskb_may_pull(skb, @@ -287,8 +290,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, tpi->key); } @@ -312,10 +315,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, @@ -338,7 +340,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ERSPAN_V2_MDSIZE); info = &tun_dst->u.tun_info; - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); } @@ -381,10 +384,13 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, tnl_params = &tunnel->parms.iph; if (tunnel->collect_md || tnl_params->daddr == 0) { - __be16 flags; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __be64 tun_id; - flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + ip_tunnel_flags_and(flags, tpi->flags, flags); + tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) @@ -464,12 +470,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - __be16 flags = tunnel->parms.o_flags; + IP_TUNNEL_DECLARE_FLAGS(flags); + + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -483,10 +492,10 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tunnel_hlen; - __be16 flags; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -500,14 +509,19 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; /* Push Tunnel header. */ - if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto err_free_skb; - flags = tun_info->key.tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); + gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -521,6 +535,7 @@ err_free_skb: static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; @@ -536,7 +551,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto err_free_skb; if (tun_info->options_len < sizeof(*md)) goto err_free_skb; @@ -589,8 +604,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; } - gre_build_header(skb, 8, TUNNEL_SEQ, - proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -664,7 +680,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, tnl_params = &tunnel->parms.iph; } - if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); @@ -706,7 +723,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, /* Push ERSPAN header */ if (tunnel->erspan_ver == 0) { proto = htons(ETH_P_ERSPAN); - tunnel->parms.o_flags &= ~TUNNEL_SEQ; + __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, @@ -721,7 +738,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, goto free_skb; } - tunnel->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; @@ -744,7 +761,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } - if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) @@ -762,7 +780,6 @@ free_skb: static void ipgre_link_update(struct net_device *dev, bool set_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); - __be16 flags; int len; len = tunnel->tun_hlen; @@ -776,12 +793,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) dev->needed_headroom += len; if (set_mtu) - dev->mtu = max_t(int, dev->mtu - len, 68); - - flags = tunnel->parms.o_flags; + WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68)); - if (flags & TUNNEL_SEQ || - (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) { + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || + (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE)) { dev->features &= ~NETIF_F_GSO_SOFTWARE; dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } else { @@ -790,20 +806,29 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, +static int ipgre_tunnel_ctl(struct net_device *dev, + struct ip_tunnel_parm_kern *p, int cmd) { + __be16 i_flags, o_flags; int err; + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + i_flags = ip_tunnel_flags_to_be16(p->i_flags); + o_flags = ip_tunnel_flags_to_be16(p->o_flags); + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || - ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) + ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p->i_flags = gre_flags_to_tnl_flags(p->i_flags); - p->o_flags = gre_flags_to_tnl_flags(p->o_flags); + gre_flags_to_tnl_flags(p->i_flags, i_flags); + gre_flags_to_tnl_flags(p->o_flags, o_flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) @@ -812,15 +837,18 @@ static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); - p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + ip_tunnel_flags_from_be16(p->i_flags, i_flags); + o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + ip_tunnel_flags_from_be16(p->o_flags, o_flags); + return 0; } @@ -960,7 +988,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - __be16 flags; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); @@ -972,14 +999,13 @@ static void __gre_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES | NETIF_F_LLTX; dev->hw_features |= GRE_FEATURES; - flags = tunnel->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -1030,14 +1056,16 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) +static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, + dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch = ipgre_exit_batch_net, + .exit_batch_rtnl = ipgre_exit_batch_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1134,7 +1162,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1150,10 +1178,12 @@ static int ipgre_netlink_parms(struct net_device *dev, parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1201,7 +1231,7 @@ static int ipgre_netlink_parms(struct net_device *dev, static int erspan_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1360,7 +1390,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1378,7 +1408,7 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1397,8 +1427,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1413,8 +1443,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], if (err < 0) return err; - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); ipgre_link_update(dev, !tb[IFLA_MTU]); @@ -1426,8 +1456,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1442,8 +1472,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], if (err < 0) return err; - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); return 0; } @@ -1499,8 +1529,10 @@ static size_t ipgre_get_size(const struct net_device *dev) static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + struct ip_tunnel_parm_kern *p = &t->parms; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -1548,7 +1580,7 @@ static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (t->erspan_ver <= 2) { if (t->erspan_ver != 0 && !t->collect_md) - t->parms.o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) goto nla_put_failure; @@ -1702,14 +1734,16 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) +static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, + dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch = ipgre_tap_exit_batch_net, + .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1720,14 +1754,16 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_net(struct list_head *net_list) +static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, + dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch = erspan_exit_batch_net, + .exit_batch_rtnl = erspan_exit_batch_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 5e9c815665..d6fbcbd235 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -616,7 +616,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, - ((struct rtable *)dst)->rt_type); + dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 67d8466223..9500031a1f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; @@ -475,7 +475,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, goto packet_routed; /* Make sure we can route this packet. */ - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { __be32 daddr; @@ -493,7 +493,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS_TOS(sk, tos), + RT_TOS(tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -971,7 +971,7 @@ static int __ip_append_data(struct sock *sk, bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; @@ -1390,7 +1390,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; @@ -1473,7 +1473,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * by icmp_hdr(skb)->type. */ if (sk->sk_type == SOCK_RAW && - !inet_test_bit(HDRINCL, sk)) + !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 21d2ffa919..cf377377b5 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -894,7 +894,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); - int val = 0, err; + int val = 0, err, retv; bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { @@ -938,8 +938,12 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, /* If optlen==0, it is equivalent to val == 0 */ - if (optname == IP_ROUTER_ALERT) - return ip_ra_control(sk, val ? 1 : 0, NULL); + if (optname == IP_ROUTER_ALERT) { + retv = ip_ra_control(sk, val ? 1 : 0, NULL); + if (retv == 0) + inet_assign_bit(RTALERT, sk, val); + return retv; + } if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); @@ -1575,6 +1579,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_BIND_ADDRESS_NO_PORT: val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); goto copyval; + case IP_ROUTER_ALERT: + val = inet_test_bit(RTALERT, sk); + goto copyval; case IP_TTL: val = READ_ONCE(inet->uc_ttl); if (val < 0) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 55039191b8..bccef2fcf6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,17 +56,13 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, - __be16 flags, __be32 key) +static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, + const unsigned long *flags, __be32 key) { - if (p->i_flags & TUNNEL_KEY) { - if (flags & TUNNEL_KEY) - return key == p->i_key; - else - /* key expected, none present */ - return false; - } else - return !(flags & TUNNEL_KEY); + if (!test_bit(IP_TUNNEL_KEY_BIT, flags)) + return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags); + + return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key; } /* Fallback tunnel: no source, no destination, no key, no options @@ -81,7 +77,7 @@ static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, Given src, dst and key, find appropriate for input tunnel. */ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, + int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key) { @@ -102,10 +98,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else - cand = t; + cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { @@ -117,9 +112,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -137,22 +132,23 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { - if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) || + if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) && + t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -172,7 +168,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; @@ -183,7 +179,8 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, else remote = 0; - if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) && + test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags)) i_key = 0; h = ip_tunnel_hash(i_key, remote); @@ -207,21 +204,23 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; + IP_TUNNEL_DECLARE_FLAGS(flags); __be32 key = parms->i_key; - __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); + ip_tunnel_flags_copy(flags, parms->i_flags); + hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - link == t->parms.link && + link == READ_ONCE(t->parms.link) && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; @@ -231,7 +230,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; @@ -327,7 +326,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; @@ -387,15 +386,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, } #endif - if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || - ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags&TUNNEL_SEQ) { - if (!(tpi->flags&TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); @@ -544,7 +543,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rt6_info *rt6; __be32 daddr; - rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) : + rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) : NULL; daddr = md ? dst : tunnel->parms.iph.daddr; @@ -639,7 +638,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { @@ -774,7 +773,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), - dev_net(dev), tunnel->parms.link, + dev_net(dev), READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) @@ -872,7 +871,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, - struct ip_tunnel_parm *p, + struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { @@ -894,17 +893,18 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; - t->parms.link = p->link; + WRITE_ONCE(t->parms.link, p->link); t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) - dev->mtu = mtu; + WRITE_ONCE(dev->mtu, mtu); } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -928,10 +928,10 @@ int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); - if (!(p->i_flags & VTI_ISVTI)) { - if (!(p->i_flags & TUNNEL_KEY)) + if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) { + if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags)) p->i_key = 0; - if (!(p->o_flags & TUNNEL_KEY)) + if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags)) p->o_key = 0; } @@ -1006,16 +1006,58 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data) +{ + struct ip_tunnel_parm p; + + if (copy_from_user(&p, data, sizeof(p))) + return false; + + strscpy(kp->name, p.name); + kp->link = p.link; + ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags); + ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags); + kp->i_key = p.i_key; + kp->o_key = p.o_key; + memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); + + return true; +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); + +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) +{ + struct ip_tunnel_parm p; + + if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) || + !ip_tunnel_flags_is_be16_compat(kp->o_flags)) + return false; + + memset(&p, 0, sizeof(p)); + + strscpy(p.name, kp->name); + p.link = kp->link; + p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags); + p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags); + p.i_key = kp->i_key; + p.o_key = kp->o_key; + memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); + + return !copy_to_user(data, &p, sizeof(p)); +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); + int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; int err; - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(data, &p, sizeof(p))) + if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } @@ -1040,7 +1082,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) new_mtu = max_mtu; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); @@ -1078,15 +1120,15 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - return tunnel->net; + return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { - struct ip_tunnel *tunnel = netdev_priv(dev); + const struct ip_tunnel *tunnel = netdev_priv(dev); - return tunnel->parms.link; + return READ_ONCE(tunnel->parms.link); } EXPORT_SYMBOL(ip_tunnel_get_iflink); @@ -1094,7 +1136,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; @@ -1157,24 +1199,22 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, } void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops) + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill) { struct ip_tunnel_net *itn; struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, &list, ops); + ip_tunnel_destroy(net, itn, dev_to_kill, ops); } - unregister_netdevice_many(&list); - rtnl_unlock(); } EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); @@ -1228,7 +1268,7 @@ err_register_netdevice: EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 80ccd6661a..a3676155be 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) { + IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; struct metadata_dst *res; struct ip_tunnel_info *dst, *src; @@ -144,10 +145,10 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, sizeof(struct in6_addr)); else dst->key.u.ipv4.dst = src->key.u.ipv4.src; - dst->key.tun_flags = src->key.tun_flags; + ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags); dst->mode = src->mode | IP_TUNNEL_INFO_TX; ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), - src->options_len, 0); + src->options_len, tun_flags); return res; } @@ -497,7 +498,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr, opt->opt_class = nla_get_be16(attr); attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; opt->type = nla_get_u8(attr); - info->key.tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); } return sizeof(struct geneve_opt) + data_len; @@ -525,7 +526,7 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr, attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); md->gbp &= VXLAN_GBP_MASK; - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct vxlan_metadata); @@ -574,7 +575,7 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr, set_hwid(&md->u.md2, nla_get_u8(attr)); } - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct erspan_metadata); @@ -585,7 +586,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, { int err, rem, opt_len, opts_len = 0; struct nlattr *nla; - __be16 type = 0; + u32 type = 0; if (!attr) return 0; @@ -598,7 +599,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case LWTUNNEL_IP_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, extack); @@ -607,7 +608,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) return -EINVAL; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; case LWTUNNEL_IP_OPTS_VXLAN: if (type) @@ -617,7 +618,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case LWTUNNEL_IP_OPTS_ERSPAN: if (type) @@ -627,7 +628,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; default: return -EINVAL; @@ -705,10 +706,16 @@ static int ip_tun_build_state(struct net *net, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags |= - (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & - ~TUNNEL_OPTIONS_PRESENT); + if (tb[LWTUNNEL_IP_FLAGS]) { + IP_TUNNEL_DECLARE_FLAGS(flags); + + ip_tunnel_flags_from_be16(flags, + nla_get_be16(tb[LWTUNNEL_IP_FLAGS])); + ip_tunnel_clear_options_present(flags); + + ip_tunnel_flags_or(tun_info->key.tun_flags, + tun_info->key.tun_flags, flags); + } tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = opt_len; @@ -812,18 +819,18 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, struct nlattr *nest; int err = 0; - if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + if (!ip_tunnel_is_options_present(tun_info->key.tun_flags)) return 0; nest = nla_nest_start_noflag(skb, type); if (!nest) return -ENOMEM; - if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_geneve(skb, tun_info); - else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); - else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_erspan(skb, tun_info); if (err) { @@ -846,7 +853,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, + ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; @@ -857,11 +865,11 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) { int opt_len; - if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + if (!ip_tunnel_is_options_present(info->key.tun_flags)) return 0; opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ - if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { struct geneve_opt *opt; int offset = 0; @@ -874,10 +882,10 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) /* OPT_GENEVE_DATA */ offset += sizeof(*opt) + opt->length * 4; } - } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ - } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { struct erspan_metadata *md = ip_tunnel_info_opts(info); opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ @@ -984,10 +992,17 @@ static int ip6_tun_build_state(struct net *net, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags |= - (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & - ~TUNNEL_OPTIONS_PRESENT); + if (tb[LWTUNNEL_IP6_FLAGS]) { + IP_TUNNEL_DECLARE_FLAGS(flags); + __be16 data; + + data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + ip_tunnel_flags_from_be16(flags, data); + ip_tunnel_clear_options_present(flags); + + ip_tunnel_flags_or(tun_info->key.tun_flags, + tun_info->key.tun_flags, flags); + } tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = opt_len; @@ -1008,7 +1023,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, + ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; @@ -1116,7 +1132,7 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); @@ -1139,8 +1155,12 @@ void ip_tunnel_netlink_parms(struct nlattr *data[], if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); - if (data[IFLA_IPTUN_FLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + if (data[IFLA_IPTUN_FLAGS]) { + __be16 flags; + + flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + ip_tunnel_flags_from_be16(parms->i_flags, flags); + } if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index d1d6bb28ed..14536da9f5 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -51,8 +51,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -167,7 +170,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parms = &tunnel->parms; + struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; @@ -322,8 +325,11 @@ static int vti4_err(struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; @@ -373,8 +379,9 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { @@ -383,20 +390,26 @@ vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) return -EINVAL; } - if (!(p->i_flags & GRE_KEY)) + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY)) p->i_key = 0; - if (!(p->o_flags & GRE_KEY)) + if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY)) p->o_key = 0; - p->i_flags = VTI_ISVTI; + __set_bit(IP_TUNNEL_VTI_BIT, flags); + ip_tunnel_flags_copy(p->i_flags, flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p->i_flags |= GRE_KEY; - p->o_flags |= GRE_KEY; + ip_tunnel_flags_from_be16(flags, GRE_KEY); + ip_tunnel_flags_or(p->i_flags, p->i_flags, flags); + ip_tunnel_flags_or(p->o_flags, p->o_flags, flags); } return 0; } @@ -510,14 +523,16 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_net(struct list_head *list_net) +static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, + dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch = vti_exit_batch_net, + .exit_batch_rtnl = vti_exit_batch_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -529,7 +544,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void vti_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -539,7 +554,7 @@ static void vti_netlink_parms(struct nlattr *data[], if (!data) return; - parms->i_flags = VTI_ISVTI; + __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags); if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -564,7 +579,7 @@ static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); @@ -576,8 +591,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); @@ -604,7 +619,7 @@ static size_t vti_get_size(const struct net_device *dev) static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; + struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 03afa3871e..923a2ef68c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -130,13 +130,16 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + + t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, + iph->saddr, 0); if (!t) { err = -ENOENT; goto out; @@ -213,13 +216,16 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, + iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; @@ -238,7 +244,9 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { - tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); + ip_tunnel_flags_zero(flags); + + tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); @@ -330,7 +338,7 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || @@ -340,7 +348,8 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) } p->i_key = p->o_key = 0; - p->i_flags = p->o_flags = 0; + ip_tunnel_flags_zero(p->i_flags); + ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } @@ -405,8 +414,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, bool *collect_md, - __u32 *fwmark) + struct ip_tunnel_parm_kern *parms, + bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -432,8 +441,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { @@ -452,8 +461,8 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; @@ -510,7 +519,7 @@ static size_t ipip_get_size(const struct net_device *dev) static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || @@ -592,14 +601,16 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_net(struct list_head *list_net) +static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, + dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit_batch = ipip_exit_batch_net, + .exit_batch_rtnl = ipip_exit_batch_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b53c36c473..6c750bd13d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -441,7 +441,7 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); @@ -2589,7 +2589,9 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct fib_dump_filter filter = {}; + struct fib_dump_filter filter = { + .rtnl_held = true, + }; int err; if (cb->strict_check) { @@ -3141,10 +3143,7 @@ int __init ip_mr_init(void) { int err; - mrt_cachep = kmem_cache_create("ip_mrt_cache", - sizeof(struct mfc_cache), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, - NULL); + mrt_cachep = KMEM_CACHE(mfc_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = register_pernet_subsys(&ipmr_net_ops); if (err) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f71a7e9a7d..1b991b8895 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -10,6 +10,10 @@ config NF_DEFRAG_IPV4 tristate default n +# old sockopt interface and eval loop +config IP_NF_IPTABLES_LEGACY + tristate + config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -152,7 +156,7 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED - depends on IP_NF_MANGLE || IP_NF_RAW + depends on IP_NF_MANGLE || IP_NF_RAW || NFT_COMPAT help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -173,6 +177,7 @@ config IP_NF_MATCH_TTL config IP_NF_FILTER tristate "Packet filtering" default m if NETFILTER_ADVANCED=n + select IP_NF_IPTABLES_LEGACY help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -182,7 +187,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" - depends on IP_NF_FILTER + depends on IP_NF_FILTER || NFT_COMPAT select NF_REJECT_IPV4 default m if NETFILTER_ADVANCED=n help @@ -212,6 +217,7 @@ config IP_NF_NAT default m if NETFILTER_ADVANCED=n select NF_NAT select NETFILTER_XT_NAT + select IP_NF_IPTABLES_LEGACY help This enables the `nat' table in iptables. This allows masquerading, port forwarding and other forms of full Network Address Port @@ -252,6 +258,7 @@ endif # IP_NF_NAT config IP_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n + select IP_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -261,7 +268,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_ECN tristate "ECN target support" - depends on IP_NF_MANGLE + depends on IP_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `ECN' target, which can be used in the iptables mangle @@ -286,6 +293,7 @@ config IP_NF_TARGET_TTL # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' + select IP_NF_IPTABLES_LEGACY help This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -299,6 +307,7 @@ config IP_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED + select IP_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -309,36 +318,36 @@ endif # IP_NF_IPTABLES # ARP tables config IP_NF_ARPTABLES - tristate "ARP tables support" - select NETFILTER_XTABLES - select NETFILTER_FAMILY_ARP - depends on NETFILTER_ADVANCED - help - arptables is a general, extensible packet identification framework. - The ARP packet filtering and mangling (manipulation)subsystems - use this: say Y or M here if you want to use either of those. - - To compile it as a module, choose M here. If unsure, say N. + tristate -if IP_NF_ARPTABLES +config NFT_COMPAT_ARP + tristate + depends on NF_TABLES_ARP && NFT_COMPAT + default m if NFT_COMPAT=m + default y if NFT_COMPAT=y config IP_NF_ARPFILTER - tristate "ARP packet filtering" + tristate "arptables-legacy packet filtering support" + select IP_NF_ARPTABLES + select NETFILTER_FAMILY_ARP + depends on NETFILTER_XTABLES help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and - local output. On a bridge, you can also specify filtering rules - for forwarded ARP packets. See the man page for arptables(8). + local output. This is only needed for arptables-legacy(8). + Neither arptables-nft nor nftables need this to work. To compile it as a module, choose M here. If unsure, say N. config IP_NF_ARP_MANGLE tristate "ARP payload mangling" + depends on IP_NF_ARPTABLES || NFT_COMPAT_ARP help Allows altering the ARP packet payload: source and destination hardware and network addresses. -endif # IP_NF_ARPTABLES + This option is needed by both arptables-legacy and arptables-nft. + It is not used by nftables. endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 5a26f9de1a..85502d4dfb 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o # generic IP tables -obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o +obj-$(CONFIG_IP_NF_IPTABLES_LEGACY) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index b9062f4552..3ab908b747 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -44,7 +44,7 @@ static int iptable_filter_table_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = - forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 4d42d0756f..a5db7c67d6 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -145,25 +145,27 @@ static struct pernet_operations iptable_nat_net_ops = { static int __init iptable_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv4_table, - iptable_nat_table_init); + int ret; + /* net->gen->ptr[iptable_nat_net_id] must be allocated + * before calling iptable_nat_table_init(). + */ + ret = register_pernet_subsys(&iptable_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&iptable_nat_net_ops); - if (ret < 0) { - xt_unregister_template(&nf_nat_ipv4_table); - return ret; - } + ret = xt_register_template(&nf_nat_ipv4_table, + iptable_nat_table_init); + if (ret < 0) + unregister_pernet_subsys(&iptable_nat_net_ops); return ret; } static void __exit iptable_nat_exit(void) { - unregister_pernet_subsys(&iptable_nat_net_ops); xt_unregister_template(&nf_nat_ipv4_table); + unregister_pernet_subsys(&iptable_nat_net_ops); } module_init(iptable_nat_init); diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 69e3317996..73e66a088e 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -58,6 +58,8 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); + if (!indev) + return daddr; in_dev_for_each_ifa_rcu(ifa, indev) { if (ifa->ifa_flags & IFA_F_SECONDARY) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index bbff68b5b5..6b9787ee86 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -26,6 +26,9 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) +#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \ + NHA_OP_FLAG_DUMP_HW_STATS) + static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, @@ -37,10 +40,17 @@ static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, + [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true), }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, + [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, + NHA_OP_FLAGS_DUMP_ALL), +}; + +static const struct nla_policy rtm_nh_policy_del[] = { + [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { @@ -48,6 +58,8 @@ static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, + [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, + NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_res_policy_new[] = { @@ -92,6 +104,7 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; + nh_info->id = nhi->nh_parent->id; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; @@ -131,13 +144,13 @@ static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; + info->nh_grp->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); - info->nh_grp->nh_entries[i].id = nhge->nh->id; info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); @@ -162,6 +175,7 @@ static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; + info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; @@ -393,6 +407,7 @@ static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct nh_notifier_info info = { .net = net, .extack = extack, + .id = nh->id, }; struct nh_group *nhg; int err; @@ -474,6 +489,7 @@ static void nexthop_free_group(struct nexthop *nh) struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); + free_percpu(nhge->stats); nexthop_put(nhge->nh); } @@ -654,8 +670,204 @@ nla_put_failure: return -EMSGSIZE; } -static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) +static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) +{ + struct nh_grp_entry_stats *cpu_stats; + + cpu_stats = get_cpu_ptr(nhge->stats); + u64_stats_update_begin(&cpu_stats->syncp); + u64_stats_inc(&cpu_stats->packets); + u64_stats_update_end(&cpu_stats->syncp); + put_cpu_ptr(cpu_stats); +} + +static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, + u64 *ret_packets) { + int i; + + *ret_packets = 0; + + for_each_possible_cpu(i) { + struct nh_grp_entry_stats *cpu_stats; + unsigned int start; + u64 packets; + + cpu_stats = per_cpu_ptr(nhge->stats, i); + do { + start = u64_stats_fetch_begin(&cpu_stats->syncp); + packets = u64_stats_read(&cpu_stats->packets); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); + + *ret_packets += packets; + } +} + +static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, + const struct nexthop *nh) +{ + struct nh_group *nhg; + int i; + + ASSERT_RTNL(); + nhg = rtnl_dereference(nh->nh_grp); + + info->id = nh->id; + info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; + info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, + stats, nhg->num_nh), + GFP_KERNEL); + if (!info->nh_grp_hw_stats) + return -ENOMEM; + + info->nh_grp_hw_stats->num_nh = nhg->num_nh; + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; + } + + return 0; +} + +static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) +{ + kfree(info->nh_grp_hw_stats); +} + +void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, + unsigned int nh_idx, + u64 delta_packets) +{ + info->hw_stats_used = true; + info->stats[nh_idx].packets += delta_packets; +} +EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); + +static void nh_grp_hw_stats_apply_update(struct nexthop *nh, + struct nh_notifier_info *info) +{ + struct nh_group *nhg; + int i; + + ASSERT_RTNL(); + nhg = rtnl_dereference(nh->nh_grp); + + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; + } +} + +static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) +{ + struct nh_notifier_info info = { + .net = nh->net, + }; + struct net *net = nh->net; + int err; + + if (nexthop_notifiers_is_empty(net)) { + *hw_stats_used = false; + return 0; + } + + err = nh_notifier_grp_hw_stats_init(&info, nh); + if (err) + return err; + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, + &info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. + */ + nh_grp_hw_stats_apply_update(nh, &info); + *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; + + nh_notifier_grp_hw_stats_fini(&info); + return notifier_to_errno(err); +} + +static int nla_put_nh_group_stats_entry(struct sk_buff *skb, + struct nh_grp_entry *nhge, + u32 op_flags) +{ + struct nlattr *nest; + u64 packets; + + nh_grp_entry_stats_read(nhge, &packets); + + nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || + nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, + packets + nhge->packets_hw)) + goto nla_put_failure; + + if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && + nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, + nhge->packets_hw)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, + u32 op_flags) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + struct nlattr *nest; + bool hw_stats_used; + int err; + int i; + + if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) + goto err_out; + + if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && + nhg->hw_stats) { + err = nh_grp_hw_stats_update(nh, &hw_stats_used); + if (err) + goto out; + + if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) + goto err_out; + } + + nest = nla_nest_start(skb, NHA_GROUP_STATS); + if (!nest) + goto err_out; + + for (i = 0; i < nhg->num_nh; i++) + if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], + op_flags)) + goto cancel_out; + + nla_nest_end(skb, nest); + return 0; + +cancel_out: + nla_nest_cancel(skb, nest); +err_out: + err = -EMSGSIZE; +out: + return err; +} + +static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, + u32 op_flags) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; @@ -676,14 +888,20 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { - p->id = nhg->nh_entries[i].nh->id; - p->weight = nhg->nh_entries[i].weight - 1; - p += 1; + *p++ = (struct nexthop_grp) { + .id = nhg->nh_entries[i].nh->id, + .weight = nhg->nh_entries[i].weight - 1, + }; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; + if (op_flags & NHA_OP_FLAG_DUMP_STATS && + (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || + nla_put_nh_group_stats(skb, nh, op_flags))) + goto nla_put_failure; + return 0; nla_put_failure: @@ -691,7 +909,8 @@ nla_put_failure: } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, - int event, u32 portid, u32 seq, unsigned int nlflags) + int event, u32 portid, u32 seq, unsigned int nlflags, + u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; @@ -718,7 +937,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; - if (nla_put_nh_group(skb, nhg)) + if (nla_put_nh_group(skb, nh, op_flags)) goto nla_put_failure; goto out; } @@ -849,7 +1068,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) if (!skb) goto errout; - err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); + err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1104,6 +1323,7 @@ static int nh_check_attr_group(struct net *net, if (!tb[i]) continue; switch (i) { + case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: @@ -1176,6 +1396,7 @@ static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; + nh_grp_entry_stats_inc(nhge); return nhge->nh; } @@ -1185,7 +1406,7 @@ static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { - struct nexthop *rc = NULL; + struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) @@ -1200,16 +1421,20 @@ static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) if (!nexthop_is_good_nh(nhge->nh)) continue; - if (!rc) - rc = nhge->nh; + if (!nhge0) + nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; + nh_grp_entry_stats_inc(nhge); return nhge->nh; } - return rc ? : nhg->nh_entries[0].nh; + if (!nhge0) + nhge0 = &nhg->nh_entries[0]; + nh_grp_entry_stats_inc(nhge0); + return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) @@ -1225,6 +1450,7 @@ static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); + nh_grp_entry_stats_inc(nhge); return nhge->nh; } @@ -1798,6 +2024,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, newg->has_v4 = true; list_del(&nhges[i].nh_list); + new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; @@ -1813,6 +2040,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); + free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through @@ -2477,6 +2705,13 @@ static struct nexthop *nexthop_create_group(struct net *net, if (nhi->family == AF_INET) nhg->has_v4 = true; + nhg->nh_entries[i].stats = + netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); + if (!nhg->nh_entries[i].stats) { + err = -ENOMEM; + nexthop_put(nhe); + goto out_no_nh; + } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = entry[i].weight + 1; list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); @@ -2509,6 +2744,9 @@ static struct nexthop *nexthop_create_group(struct net *net, if (cfg->nh_fdb) nhg->fdb_nh = 1; + if (cfg->nh_hw_stats) + nhg->hw_stats = true; + rcu_assign_pointer(nh->nh_grp, nhg); return nh; @@ -2516,6 +2754,7 @@ static struct nexthop *nexthop_create_group(struct net *net, out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); + free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } @@ -2850,6 +3089,9 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); + if (tb[NHA_HW_STATS_ENABLE]) + cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); + /* no other attributes should be set */ goto out; } @@ -2941,6 +3183,10 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } + if (tb[NHA_HW_STATS_ENABLE]) { + NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); + goto out; + } err = 0; out: @@ -2966,9 +3212,9 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static int __nh_valid_get_del_req(const struct nlmsghdr *nlh, - struct nlattr **tb, u32 *id, - struct netlink_ext_ack *extack) +static int nh_valid_get_del_req(const struct nlmsghdr *nlh, + struct nlattr **tb, u32 *id, u32 *op_flags, + struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); @@ -2988,28 +3234,21 @@ static int __nh_valid_get_del_req(const struct nlmsghdr *nlh, return -EINVAL; } - return 0; -} - -static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; - int err; - - err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, - ARRAY_SIZE(rtm_nh_policy_get) - 1, - rtm_nh_policy_get, extack); - if (err < 0) - return err; + if (op_flags) { + if (tb[NHA_OP_FLAGS]) + *op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); + else + *op_flags = 0; + } - return __nh_valid_get_del_req(nlh, tb, id, extack); + return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, @@ -3020,7 +3259,13 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u32 id; - err = nh_valid_get_del_req(nlh, &id, extack); + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, + extack); + if (err < 0) + return err; + + err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; @@ -3037,13 +3282,21 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; + u32 op_flags; int err; u32 id; - err = nh_valid_get_del_req(nlh, &id, extack); + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, + extack); + if (err < 0) + return err; + + err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; @@ -3058,7 +3311,7 @@ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); + nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; @@ -3079,6 +3332,7 @@ struct nh_dump_filter { bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; + u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, @@ -3166,6 +3420,11 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, if (err < 0) return err; + if (tb[NHA_OP_FLAGS]) + filter->op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); + else + filter->op_flags = 0; + return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } @@ -3223,7 +3482,7 @@ static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); + cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ @@ -3241,10 +3500,6 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); - if (err < 0) { - if (likely(skb->len)) - err = skb->len; - } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3439,11 +3694,6 @@ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, &rtm_dump_nexthop_bucket_cb, &dd); } - if (err < 0) { - if (likely(skb->len)) - err = skb->len; - } - cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; @@ -3483,7 +3733,7 @@ static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, if (err < 0) return err; - err = __nh_valid_get_del_req(nlh, tb, id, extack); + err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; @@ -3631,17 +3881,24 @@ unlock: } EXPORT_SYMBOL(register_nexthop_notifier); -int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; - rtnl_lock(); err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); - if (err) - goto unlock; - nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); -unlock: + if (!err) + nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); + return err; +} +EXPORT_SYMBOL(__unregister_nexthop_notifier); + +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } @@ -3737,16 +3994,20 @@ out: } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit_batch(struct list_head *net_list) +static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net *net; - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { + ASSERT_RTNL(); + list_for_each_entry(net, net_list, exit_list) flush_all_nexthops(net); - kfree(net->nexthop.devhash); - } - rtnl_unlock(); +} + +static void __net_exit nexthop_net_exit(struct net *net) +{ + kfree(net->nexthop.devhash); + net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) @@ -3764,7 +4025,8 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, - .exit_batch = nexthop_net_exit_batch, + .exit = nexthop_net_exit, + .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, }; static int __init nexthop_init(void) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 5f4654ebff..6c4664c681 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -33,6 +33,7 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> @@ -395,7 +396,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, + IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2, READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 288f1846b3..4cb43401e0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -175,6 +175,13 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; + + if (atomic_read(&sk->sk_rmem_alloc) >= + READ_ONCE(sk->sk_rcvbuf)) { + atomic_inc(&sk->sk_drops); + continue; + } + delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, @@ -310,7 +317,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) } nf_reset_ct(skb); - skb_push(skb, skb->data - skb_network_header(skb)); + skb_push(skb, -skb_network_offset(skb)); raw_rcv_skb(sk, skb); return 0; @@ -605,6 +612,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); + fl4.fl4_icmp_type = 0; + fl4.fl4_icmp_code = 0; + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -816,7 +826,7 @@ static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *o out: return ret; } -static int do_raw_setsockopt(struct sock *sk, int level, int optname, +static int do_raw_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { @@ -833,11 +843,11 @@ static int raw_setsockopt(struct sock *sk, int level, int optname, { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); - return do_raw_setsockopt(sk, level, optname, optval, optlen); + return do_raw_setsockopt(sk, optname, optval, optlen); } -static int do_raw_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) +static int do_raw_getsockopt(struct sock *sk, int optname, + char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) @@ -853,7 +863,7 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, { if (level != SOL_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); - return do_raw_getsockopt(sk, level, optname, optval, optlen); + return do_raw_getsockopt(sk, optname, optval, optlen); } static int raw_ioctl(struct sock *sk, int cmd, int *karg) diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index fe2140c837..cc793bd8de 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -213,6 +213,7 @@ static int raw_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler raw_diag_handler = { + .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f67d3d6fe9..990912fa18 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -106,9 +106,6 @@ #include "fib_lookup.h" -#define RT_FL_TOS(oldflp4) \ - ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) - #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) @@ -132,7 +129,8 @@ struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, @@ -498,15 +496,6 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void ip_rt_fix_tos(struct flowi4 *fl4) -{ - __u8 tos = RT_FL_TOS(fl4); - - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - if (tos & RTO_ONLINK) - fl4->flowi4_scope = RT_SCOPE_LINK; -} - static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, @@ -831,28 +820,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf u32 mark = skb->mark; __u8 tos = iph->tos; - rt = (struct rtable *) dst; + rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - struct dst_entry *ret = dst; + struct rtable *rt = dst_rtable(dst); - if (rt) { - if (dst->obsolete > 0) { - ip_rt_put(rt); - ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) { - ip_rt_put(rt); - ret = NULL; - } - } - return ret; + if ((dst->obsolete > 0) || + (rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) + sk_dst_reset(sk); } /* @@ -1056,7 +1038,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); @@ -1127,7 +1109,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); - rt = (struct rtable *)odst; + rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) @@ -1136,7 +1118,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1193,7 +1175,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -1281,7 +1263,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), + .flowi4_tos = iph->tos & IPTOS_RT_MASK, .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -1528,10 +1510,8 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - ip_dst_metrics_put(dst); - rt_del_uncached_list(rt); + rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) @@ -2314,7 +2294,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (IN_DEV_BFORWARD(in_dev)) goto make_route; /* not do cache if bc_forwarding is enabled */ - if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) + if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING)) do_cache = false; goto brd_input; } @@ -2639,7 +2619,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - ip_rt_fix_tos(fl4); + fl4->flowi4_tos &= IPTOS_RT_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -2832,7 +2812,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); @@ -2877,9 +2857,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; - rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0)); } return rt; @@ -2888,9 +2868,9 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, - struct rtable *rt, u32 table_id, struct flowi4 *fl4, - struct sk_buff *skb, u32 portid, u32 seq, - unsigned int flags) + struct rtable *rt, u32 table_id, dscp_t dscp, + struct flowi4 *fl4, struct sk_buff *skb, u32 portid, + u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; @@ -2906,7 +2886,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; + r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; @@ -2994,7 +2974,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); @@ -3056,7 +3036,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, - table_id, NULL, skb, + table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) @@ -3352,7 +3332,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; - fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos); + fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; @@ -3379,8 +3359,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { - err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, + err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, + skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) @@ -3510,7 +3490,6 @@ static struct ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static const char ipv4_route_flush_procname[] = "flush"; @@ -3544,7 +3523,6 @@ static struct ctl_table ipv4_route_netns_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; static __net_init int sysctl_route_net_init(struct net *net) @@ -3562,16 +3540,14 @@ static __net_init int sysctl_route_net_init(struct net *net) /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { - if (tbl[0].procname != ipv4_route_flush_procname) { - tbl[0].procname = NULL; + if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; - } } /* Update the variables to point into the current struct net * except for the first element flush */ - for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++) + for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; @@ -3591,7 +3567,7 @@ err_dup: static __net_exit void sysctl_route_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); @@ -3694,9 +3670,8 @@ int __init ip_rt_init(void) panic("IP: failed to allocate ip_rt_acct\n"); #endif - ipv4_dst_ops.kmem_cachep = - kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable, + SLAB_HWCACHE_ALIGN | SLAB_PANIC); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 61f1c96cfe..b61d36810f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -51,15 +51,6 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, count, &syncookie_secret[c]); } -/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ -static u64 tcp_ns_to_ts(bool usec_ts, u64 val) -{ - if (usec_ts) - return div_u64(val, NSEC_PER_USEC); - - return div_u64(val, NSEC_PER_MSEC); -} - /* * when syncookies are in effect and tcp timestamps are enabled we encode * tcp options in the lower bits of the timestamp value that will be @@ -304,6 +295,24 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, return 0; } +#if IS_ENABLED(CONFIG_BPF) +struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) +{ + struct request_sock *req = inet_reqsk(skb->sk); + + skb->sk = NULL; + skb->destructor = NULL; + + if (cookie_tcp_reqsk_init(sk, skb, req)) { + reqsk_free(req); + req = NULL; + } + + return req; +} +EXPORT_SYMBOL_GPL(cookie_bpf_check); +#endif + struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, @@ -399,16 +408,23 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct rtable *rt; __u8 rcv_wscale; int full_space; + SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; - req = cookie_tcp_check(net, sk, skb); - if (IS_ERR(req)) - goto out; - if (!req) + if (cookie_bpf_ok(skb)) { + req = cookie_bpf_check(sk, skb); + } else { + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + } + if (!req) { + SKB_DR_SET(reason, NO_SOCKET); goto out_drop; + } ireq = inet_rsk(req); @@ -420,8 +436,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) */ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); - if (security_inet_conn_request(sk, skb, req)) + if (security_inet_conn_request(sk, skb, req)) { + SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; + } tcp_ao_syncookie(sk, skb, req, AF_INET); @@ -438,11 +456,14 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) + if (IS_ERR(rt)) { + SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; + } /* Try to redo what tcp_v4_send_synack did. */ - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : + dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && @@ -454,19 +475,27 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; + /* req->syncookie is set true only if ACK is validated + * by BPF kfunc, then, rcv_wscale is already configured. + */ + if (!req->syncookie) + ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ - if (ret) - inet_sk(ret)->cork.fl.u.ip4 = fl4; + if (!ret) { + SKB_DR_SET(reason, NO_SOCKET); + goto out_drop; + } + inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; out_free: reqsk_free(req); out_drop: + kfree_skb_reason(skb, reason); return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e4f16a7dc..162a0a3b6b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -575,7 +575,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, - { } }; static struct ctl_table ipv4_net_table[] = { @@ -1502,11 +1501,11 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, - { } }; static __net_init int ipv4_sysctl_init_net(struct net *net) { + size_t table_size = ARRAY_SIZE(ipv4_net_table); struct ctl_table *table; table = ipv4_net_table; @@ -1517,7 +1516,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { + for (i = 0; i < table_size; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net @@ -1533,7 +1532,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, - ARRAY_SIZE(ipv4_net_table)); + table_size); if (!net->ipv4.ipv4_hdr) goto err_reg; @@ -1554,7 +1553,7 @@ err_alloc: static __net_exit void ipv4_sysctl_exit_net(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5887eac87b..ec69110341 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,13 +272,17 @@ #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> +#include <net/rstreason.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> +#include <net/hotdata.h> +#include <net/rps.h> /* Track pending CMSGs. */ enum { @@ -289,6 +293,9 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(u32, tcp_tw_isn); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); + long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); @@ -591,7 +598,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) */ mask |= EPOLLOUT | EPOLLWRNORM; } - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) @@ -974,7 +981,7 @@ int tcp_wmem_schedule(struct sock *sk, int copy) * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] * to guarantee some progress. */ - left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued; + left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued; if (left > 0) sk_forced_mem_schedule(sk, min(left, copy)); return min(copy, sk->sk_forward_alloc); @@ -1158,6 +1165,9 @@ new_segment: process_backlog++; +#ifdef CONFIG_SKB_DECRYPTED + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif tcp_skb_entail(sk, skb); copy = size_goal; @@ -1183,7 +1193,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= READ_ONCE(sysctl_max_skb_frags)) { + if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1415,8 +1425,6 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) struct sk_buff *skb; int copied = 0, err = 0; - /* XXX -- need to support SO_PEEK_OFF */ - skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -1720,7 +1728,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); - tcp_sk(sk)->window_clamp = val; + WRITE_ONCE(tcp_sk(sk)->window_clamp, val); } return 0; } @@ -2327,6 +2335,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; + u32 peek_offset = 0; u32 urg_hole = 0; err = -ENOTCONN; @@ -2360,7 +2369,8 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, seq = &tp->copied_seq; if (flags & MSG_PEEK) { - peek_seq = tp->copied_seq; + peek_offset = max(sk_peek_offset(sk, flags), 0); + peek_seq = tp->copied_seq + peek_offset; seq = &peek_seq; } @@ -2463,11 +2473,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, } if ((flags & MSG_PEEK) && - (peek_seq - copied - urg_hole != tp->copied_seq)) { + (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); - peek_seq = tp->copied_seq; + peek_seq = tp->copied_seq + peek_offset; } continue; @@ -2508,7 +2518,10 @@ found_ok_skb: WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; - + if (flags & MSG_PEEK) + sk_peek_offset_fwd(sk, used); + else + sk_peek_offset_bwd(sk, used); tcp_rcv_space_adjust(sk); skip_copy: @@ -2636,6 +2649,10 @@ void tcp_set_state(struct sock *sk, int state) if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; + case TCP_CLOSE_WAIT: + if (oldstate == TCP_SYN_RECV) + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) @@ -2647,7 +2664,7 @@ void tcp_set_state(struct sock *sk, int state) inet_put_port(sk); fallthrough; default: - if (oldstate == TCP_ESTABLISHED) + if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } @@ -2709,7 +2726,7 @@ void tcp_shutdown(struct sock *sk, int how) /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); @@ -2743,7 +2760,15 @@ static bool tcp_too_many_orphans(int shift) READ_ONCE(sysctl_tcp_max_orphans); } -bool tcp_check_oom(struct sock *sk, int shift) +static bool tcp_out_of_memory(const struct sock *sk) +{ + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) + return true; + return false; +} + +bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; @@ -2804,7 +2829,8 @@ void __tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + tcp_send_active_reset(sk, sk->sk_allocation, + SK_RST_REASON_NOT_SPECIFIED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2818,7 +2844,7 @@ void __tcp_close(struct sock *sk, long timeout) * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 - * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), @@ -2878,7 +2904,8 @@ adjudge_to_death: struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2896,7 +2923,8 @@ adjudge_to_death: if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -3000,7 +3028,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any()); + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); @@ -3009,6 +3037,7 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); + sk_set_peek_off(sk, -1); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); @@ -3378,7 +3407,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; - tp->window_clamp = 0; + WRITE_ONCE(tp->window_clamp, 0); } else { u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? @@ -3387,7 +3416,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) if (new_window_clamp == old_window_clamp) return 0; - tp->window_clamp = new_window_clamp; + WRITE_ONCE(tp->window_clamp, new_window_clamp); if (new_window_clamp < old_window_clamp) { /* need to apply the reserved mem provisioning only * when shrinking the window clamp @@ -4056,7 +4085,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: - val = tp->window_clamp; + val = READ_ONCE(tp->window_clamp); break; case TCP_INFO: { struct tcp_info info; @@ -4341,6 +4370,9 @@ zerocopy_rcv_out: return err; } + case TCP_IS_MPTCP: + val = 0; + break; default: return -ENOPROTOOPT; } @@ -4551,13 +4583,10 @@ int tcp_abort(struct sock *sk, int err) bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { - WRITE_ONCE(sk->sk_err, err); - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); + tcp_done_with_error(sk, err); } bh_unlock_sock(sk); @@ -4647,16 +4676,16 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); @@ -4669,7 +4698,11 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); + + /* 32bit arches with 8byte alignment on u64 fields might need padding + * before tcp_clock_cache. + */ + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 254d6e3f93..9171e10668 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -266,32 +266,49 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -void tcp_ao_destroy_sock(struct sock *sk, bool twsk) +static void tcp_ao_info_free_rcu(struct rcu_head *head) { - struct tcp_ao_info *ao; + struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; + hlist_for_each_entry_safe(key, n, &ao->head, node) { + hlist_del(&key->node); + tcp_sigpool_release(key->tcp_sigpool_id); + kfree_sensitive(key); + } + kfree(ao); + static_branch_slow_dec_deferred(&tcp_ao_needed); +} + +static void tcp_ao_sk_omem_free(struct sock *sk, struct tcp_ao_info *ao) +{ + size_t total_ao_sk_mem = 0; + struct tcp_ao_key *key; + + hlist_for_each_entry(key, &ao->head, node) + total_ao_sk_mem += tcp_ao_sizeof_key(key); + atomic_sub(total_ao_sk_mem, &sk->sk_omem_alloc); +} + +void tcp_ao_destroy_sock(struct sock *sk, bool twsk) +{ + struct tcp_ao_info *ao; + if (twsk) { ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1); - tcp_twsk(sk)->ao_info = NULL; + rcu_assign_pointer(tcp_twsk(sk)->ao_info, NULL); } else { ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1); - tcp_sk(sk)->ao_info = NULL; + rcu_assign_pointer(tcp_sk(sk)->ao_info, NULL); } if (!ao || !refcount_dec_and_test(&ao->refcnt)) return; - hlist_for_each_entry_safe(key, n, &ao->head, node) { - hlist_del_rcu(&key->node); - if (!twsk) - atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc); - call_rcu(&key->rcu, tcp_ao_key_free_rcu); - } - - kfree_rcu(ao, rcu); - static_branch_slow_dec_deferred(&tcp_ao_needed); + if (!twsk) + tcp_ao_sk_omem_free(sk, ao); + call_rcu(&ao->rcu, tcp_ao_info_free_rcu); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) @@ -509,9 +526,9 @@ static int tcp_ao_hash_header(struct tcp_sigpool *hp, bool exclude_options, u8 *hash, int hash_offset, int hash_len) { - int err, len = th->doff << 2; struct scatterlist sg; u8 *hdr = hp->scratch; + int err, len; /* We are not allowed to change tcphdr, make a local copy */ if (exclude_options) { @@ -933,6 +950,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, struct tcp_ao_key *key; __be32 sisn, disn; u8 *traffic_key; + int state; u32 sne = 0; info = rcu_dereference(tcp_sk(sk)->ao_info); @@ -948,8 +966,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, disn = 0; } + state = READ_ONCE(sk->sk_state); /* Fast-path */ - if (likely((1 << sk->sk_state) & TCP_AO_ESTABLISHED)) { + if (likely((1 << state) & TCP_AO_ESTABLISHED)) { enum skb_drop_reason err; struct tcp_ao_key *current_key; @@ -988,6 +1007,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, return SKB_NOT_DROPPED_YET; } + if (unlikely(state == TCP_CLOSE)) + return SKB_DROP_REASON_TCP_CLOSE; + /* Lookup key based on peer address and keyid. * current_key and rnext_key must not be used on tcp listen * sockets as otherwise: @@ -1001,7 +1023,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, if (th->syn && !th->ack) goto verify_hash; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) { + if ((1 << state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) { /* Make the initial syn the likely case here */ if (unlikely(req)) { sne = tcp_ao_compute_sne(0, tcp_rsk(req)->rcv_isn, @@ -1018,14 +1040,14 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, /* no way to figure out initial sisn/disn - drop */ return SKB_DROP_REASON_TCP_FLAGS; } - } else if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + } else if ((1 << state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { disn = info->lisn; if (th->syn || th->rst) sisn = th->seq; else sisn = info->risn; } else { - WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", sk->sk_state); + WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", state); return SKB_DROP_REASON_TCP_AOFAILURE; } verify_hash: @@ -1963,8 +1985,10 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, first = true; } - if (cmd.ao_required && tcp_ao_required_verify(sk)) - return -EKEYREJECTED; + if (cmd.ao_required && tcp_ao_required_verify(sk)) { + err = -EKEYREJECTED; + goto out; + } /* For sockets in TCP_CLOSED it's possible set keys that aren't * matching the future peer (address/port/VRF/etc), diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 22358032dd..760941e551 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; @@ -1155,9 +1155,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; -BTF_SET8_START(tcp_bbr_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE +BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -1166,9 +1164,7 @@ BTF_ID_FLAGS(func, bbr_cwnd_event) BTF_ID_FLAGS(func, bbr_ssthresh) BTF_ID_FLAGS(func, bbr_min_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) -#endif -#endif -BTF_SET8_END(tcp_bbr_check_kfunc_ids) +BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1b34050a75..28ffcfbeef 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -146,11 +146,7 @@ EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; - int ret; - - ret = tcp_validate_congestion_control(ca); - if (ret) - return ret; + int ret = 0; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 0fd78ecb67..5dbed91c61 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -485,18 +485,14 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .name = "cubic", }; -BTF_SET8_START(tcp_cubic_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE +BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) -#endif -#endif -BTF_SET8_END(tcp_cubic_check_kfunc_ids) +BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index bb23bb5b38..8a45a4aea9 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -58,7 +58,18 @@ struct dctcp { }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ -module_param(dctcp_shift_g, uint, 0644); + +static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 0, 10); +} + +static const struct kernel_param_ops dctcp_shift_g_ops = { + .set = dctcp_shift_g_set, + .get = param_get_uint, +}; + +module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644); MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; @@ -260,18 +271,14 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; -BTF_SET8_START(tcp_dctcp_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE +BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) -#endif -#endif -BTF_SET8_END(tcp_dctcp_check_kfunc_ids) +BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4cbe4b4442..f428ecf912 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -222,6 +222,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler tcp_diag_handler = { + .owner = THIS_MODULE, .dump = tcp_diag_dump, .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df7b13f0e5..ecd5211085 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include <linux/prefetch.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> @@ -563,19 +564,20 @@ static void tcp_init_buffer_space(struct sock *sk) maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { - tp->window_clamp = maxwin; + WRITE_ONCE(tp->window_clamp, maxwin); if (tcp_app_win && maxwin > 4 * tp->advmss) - tp->window_clamp = max(maxwin - - (maxwin >> tcp_app_win), - 4 * tp->advmss); + WRITE_ONCE(tp->window_clamp, + max(maxwin - (maxwin >> tcp_app_win), + 4 * tp->advmss)); } /* Force reservation of one segment. */ if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) - tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + WRITE_ONCE(tp->window_clamp, + max(2 * tp->advmss, maxwin - tp->advmss)); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; @@ -752,8 +754,7 @@ void tcp_rcv_space_adjust(struct sock *sk) * <prev RTT . ><current RTT .. ><next RTT .... > */ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)) { u64 rcvwin, grow; int rcvbuf; @@ -769,11 +770,22 @@ void tcp_rcv_space_adjust(struct sock *sk) rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - /* Make the window clamp follow along. */ - tp->window_clamp = tcp_win_from_space(sk, rcvbuf); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); + } + } else { + /* Make the window clamp follow along while being bounded + * by SO_RCVBUF. + */ + int clamp = tcp_win_from_space(sk, min(rcvbuf, sk->sk_rcvbuf)); + + if (clamp > tp->window_clamp) + WRITE_ONCE(tp->window_clamp, clamp); } } tp->rcvq_space.space = copied; @@ -911,7 +923,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ @@ -921,7 +933,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } @@ -1164,7 +1176,7 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, - * but it is equivalent to plain S and code short-curcuits it to S. + * but it is equivalent to plain S and code short-circuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: @@ -2126,8 +2138,16 @@ void tcp_clear_retrans(struct tcp_sock *tp) static inline void tcp_init_undo(struct tcp_sock *tp) { tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ - tp->undo_retrans = tp->retrans_out ? : -1; + /* First, account for regular retransmits in flight: */ + tp->undo_retrans = tp->retrans_out; + /* Next, account for TLP retransmits in flight: */ + if (tp->tlp_high_seq && tp->tlp_retrans) + tp->undo_retrans++; + /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ + if (!tp->undo_retrans) + tp->undo_retrans = -1; } static bool tcp_is_rack(const struct sock *sk) @@ -2206,6 +2226,7 @@ void tcp_enter_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; + tp->tlp_high_seq = 0; tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous @@ -2779,13 +2800,37 @@ static void tcp_mtup_probe_success(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } +/* Sometimes we deduce that packets have been dropped due to reasons other than + * congestion, like path MTU reductions or failed client TFO attempts. In these + * cases we call this function to retransmit as many packets as cwnd allows, + * without reducing cwnd. Given that retransmits will set retrans_stamp to a + * non-zero value (and may do so in a later calling context due to TSQ), we + * also enter CA_Loss so that we track when all retransmitted packets are ACKed + * and clear retrans_stamp when that happens (to ensure later recurring RTOs + * are using the correct retrans_stamp and don't declare ETIMEDOUT + * prematurely). + */ +static void tcp_non_congestion_loss_retransmit(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (icsk->icsk_ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tcp_set_ca_state(sk, TCP_CA_Loss); + } + tcp_xmit_retransmit_queue(sk); +} + /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int mss; @@ -2825,14 +2870,7 @@ void tcp_simple_retransmit(struct sock *sk) * in network, but units changed and effective * cwnd/ssthresh really reduced now. */ - if (icsk->icsk_ca_state != TCP_CA_Loss) { - tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->prior_ssthresh = 0; - tp->undo_marker = 0; - tcp_set_ca_state(sk, TCP_CA_Loss); - } - tcp_xmit_retransmit_queue(sk); + tcp_non_congestion_loss_retransmit(sk); } EXPORT_SYMBOL(tcp_simple_retransmit); @@ -3057,7 +3095,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, return; if (tcp_try_undo_dsack(sk)) - tcp_try_keep_open(sk); + tcp_try_to_open(sk, flag); tcp_identify_packet_loss(sk, ack_flag); if (icsk->icsk_ca_state != TCP_CA_Recovery) { @@ -3539,7 +3577,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { - icsk->icsk_ca_ops->cong_control(sk, rs); + icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; } @@ -4204,6 +4242,13 @@ void tcp_parse_options(const struct net *net, */ break; #endif +#ifdef CONFIG_TCP_AO + case TCPOPT_AO: + /* TCP AO has already been checked + * (see tcp_inbound_ao_hash()). + */ + break; +#endif case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, @@ -4433,9 +4478,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, return SKB_NOT_DROPPED_YET; } + +void tcp_done_with_error(struct sock *sk, int err) +{ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + WRITE_ONCE(sk->sk_err, err); + smp_wmb(); + + tcp_write_queue_purge(sk); + tcp_done(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); +} +EXPORT_SYMBOL(tcp_done_with_error); + /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { + int err; + trace_tcp_receive_reset(sk); /* mptcp can't tell us to ignore reset pkts, @@ -4447,24 +4509,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: - WRITE_ONCE(sk->sk_err, ECONNREFUSED); + err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: - WRITE_ONCE(sk->sk_err, EPIPE); + err = EPIPE; break; case TCP_CLOSE: return; default: - WRITE_ONCE(sk->sk_err, ECONNRESET); + err = ECONNRESET; } - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - - tcp_write_queue_purge(sk); - tcp_done(sk); - - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); + tcp_done_with_error(sk, err); } /* @@ -4803,10 +4858,8 @@ static bool tcp_try_coalesce(struct sock *sk, if (!mptcp_skb_can_collapse(to, from)) return false; -#ifdef CONFIG_TLS_DEVICE - if (from->decrypted != to->decrypted) + if (skb_cmp_decrypted(from, to)) return false; -#endif if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -5174,6 +5227,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { + /* Some stacks are known to send bare FIN packets + * in a loop even if we send RWIN 0 in our ACK. + * Accepting this FIN does not hurt memory pressure + * because the FIN flag will simply be merged to the + * receive queue tail skb in most cases. + */ + if (!skb->len && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + goto queue_and_out; + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; @@ -5188,7 +5251,7 @@ queue_and_out: inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - if (skb_queue_len(&sk->sk_receive_queue)) { + if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; @@ -5375,9 +5438,7 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); -#ifdef CONFIG_TLS_DEVICE - nskb->decrypted = skb->decrypted; -#endif + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -5407,10 +5468,8 @@ restart: !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; -#ifdef CONFIG_TLS_DEVICE - if (skb->decrypted != nskb->decrypted) + if (skb_cmp_decrypted(skb, nskb)) goto end; -#endif } } } @@ -6288,7 +6347,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; skb_rbtree_walk_from(data) tcp_mark_skb_lost(sk, data); - tcp_xmit_retransmit_queue(sk); + tcp_non_congestion_loss_retransmit(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; @@ -6361,6 +6420,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_MIN, TCP_RTO_MAX); + SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE); goto reset_and_undo; } @@ -6369,6 +6429,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); + SKB_DR_SET(reason, TCP_RFC7323_PAWS); goto reset_and_undo; } @@ -6424,7 +6485,8 @@ consume: if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; - tp->window_clamp = min(tp->window_clamp, 65535U); + WRITE_ONCE(tp->window_clamp, + min(tp->window_clamp, 65535U)); } if (tp->rx_opt.saw_tstamp) { @@ -6572,7 +6634,8 @@ discard_and_undo: reset_and_undo: tcp_clear_options(&tp->rx_opt); tp->rx_opt.mss_clamp = saved_clamp; - return 1; + /* we can reuse/return @reason to its caller to handle the exception */ + return reason; } static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) @@ -6616,14 +6679,14 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) * address independent. */ -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) +enum skb_drop_reason +tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; - bool acceptable; SKB_DR(reason); switch (sk->sk_state) { @@ -6633,7 +6696,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LISTEN: if (th->ack) - return 1; + return SKB_DROP_REASON_TCP_FLAGS; if (th->rst) { SKB_DR_SET(reason, TCP_RESET); @@ -6649,12 +6712,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ rcu_read_lock(); local_bh_disable(); - acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + icsk->icsk_af_ops->conn_request(sk, skb); local_bh_enable(); rcu_read_unlock(); - if (!acceptable) - return 1; consume_skb(skb); return 0; } @@ -6699,17 +6760,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT | - FLAG_NO_CHALLENGE_ACK) > 0; - - if (!acceptable) { - if (sk->sk_state == TCP_SYN_RECV) - return 1; /* send one RST */ - tcp_send_challenge_ack(sk); - SKB_DR_SET(reason, TCP_OLD_ACK); - goto discard; + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT | + FLAG_NO_CHALLENGE_ACK); + + if ((int)reason <= 0) { + if (sk->sk_state == TCP_SYN_RECV) { + /* send one RST */ + if (!reason) + return SKB_DROP_REASON_TCP_OLD_ACK; + return -reason; + } + /* accept old ack during closing */ + if ((int)reason < 0) { + tcp_send_challenge_ack(sk); + reason = -reason; + goto discard; + } } + SKB_DR_SET(reason, NOT_SPECIFIED); switch (sk->sk_state) { case TCP_SYN_RECV: tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ @@ -6752,6 +6821,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); tcp_fast_path_on(tp); + if (sk->sk_shutdown & SEND_SHUTDOWN) + tcp_shutdown(sk, SEND_SHUTDOWN); break; case TCP_FIN_WAIT1: { @@ -6777,7 +6848,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; + return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { @@ -6786,7 +6857,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); - return 1; + return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } tmo = tcp_fin_time(sk); @@ -6851,7 +6922,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk, skb); - return 1; + return SKB_DROP_REASON_TCP_ABORT_ON_DATA; } } fallthrough; @@ -6990,7 +7061,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) +static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; @@ -7091,7 +7162,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7101,21 +7171,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; + u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); + isn = __this_cpu_read(tcp_tw_isn); + if (isn) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + __this_cpu_write(tcp_tw_isn, 0); + } else { + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); - if (!want_cookie) - goto drop; + if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { + want_cookie = tcp_syn_flood_action(sk, + rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } } if (sk_acceptq_is_full(sk)) { @@ -7154,7 +7231,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - dst = af_ops->route_req(sk, skb, &fl, req); + dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; @@ -7231,7 +7308,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; if (!want_cookie) { req->timeout = tcp_timeout_init((struct sock *)req); - inet_csk_reqsk_queue_hash_add(sk, req, req->timeout); + if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, + req->timeout))) { + reqsk_free(req); + return 0; + } + } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0c50c5a32b..a541659b65 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -70,6 +70,7 @@ #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/busy_poll.h> +#include <net/rstreason.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -154,6 +155,12 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) if (tcptw->tw_ts_recent_stamp && (!twp || (reuse && time_after32(ktime_get_seconds(), tcptw->tw_ts_recent_stamp)))) { + /* inet_twsk_hashdance() sets sk_refcnt after putting twsk + * and releasing the bucket lock. + */ + if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) + return 0; + /* In case of repair and re-using TIME-WAIT sockets we still * want to be sure that it is safe as above but honor the * sequence numbers and time stamps set as part of the repair @@ -174,7 +181,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; } - sock_hold(sktw); + return 1; } @@ -604,15 +611,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - - sk_error_report(sk); - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; } @@ -723,7 +725,8 @@ out: * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -866,11 +869,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) { + if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - if (sk_fullsock(sk)) - trace_tcp_send_reset(sk, skb); - } + + trace_tcp_send_reset(sk, skb, reason); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); @@ -1137,14 +1139,9 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #endif } - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), 0, &key, @@ -1667,7 +1664,8 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { tcp_v4_init_req(req, sk, skb); @@ -1907,7 +1905,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - reason = SKB_DROP_REASON_NOT_SPECIFIED; if (tcp_checksum_complete(skb)) goto csum_err; @@ -1915,9 +1912,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct sock *nsk = tcp_v4_cookie_check(sk, skb); if (!nsk) - goto discard; + return 0; if (nsk != sk) { - if (tcp_child_process(sk, nsk, skb)) { + reason = tcp_child_process(sk, nsk, skb); + if (reason) { rsk = nsk; goto reset; } @@ -1926,14 +1924,15 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb)) { + reason = tcp_rcv_state_process(sk, skb); + if (reason) { rsk = sk; goto reset; } return 0; reset: - tcp_v4_send_reset(rsk, skb); + tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: kfree_skb_reason(skb, reason); /* Be careful here. If this function gets more complicated and @@ -1994,7 +1993,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { - u32 limit, tail_gso_size, tail_gso_segs; + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -2003,6 +2002,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, bool fragstolen; u32 gso_segs; u32 gso_size; + u64 limit; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -2044,10 +2044,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || -#ifdef CONFIG_TLS_DEVICE - tail->decrypted != skb->decrypted || -#endif !mptcp_skb_can_collapse(tail, skb) || + skb_cmp_decrypted(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; @@ -2100,7 +2098,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, __skb_push(skb, hdrlen); no_coalesce: - limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); + /* sk->sk_backlog.len is reset only at the end of __release_sock(). + * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach + * sk_rcvbuf in normal conditions. + */ + limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; + + limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. @@ -2108,6 +2112,8 @@ no_coalesce: */ limit += 64 * 1024; + limit = min_t(u64, limit, UINT_MAX); + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); *reason = SKB_DROP_REASON_SOCKET_BACKLOG; @@ -2147,7 +2153,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2169,6 +2174,7 @@ int tcp_v4_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2206,7 +2212,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -2275,15 +2280,21 @@ process: if (nsk == sk) { reqsk_put(req); tcp_v4_restore_cb(skb); - } else if (tcp_child_process(sk, nsk, skb)) { - tcp_v4_send_reset(nsk, skb); - goto discard_and_relse; } else { + drop_reason = tcp_child_process(sk, nsk, skb); + if (drop_reason) { + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v4_send_reset(nsk, skb, rst_reason); + goto discard_and_relse; + } sock_put(sk); return 0; } } +process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { @@ -2354,7 +2365,7 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v4_send_reset(NULL, skb); + tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: @@ -2382,7 +2393,7 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2396,6 +2407,7 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } @@ -2405,7 +2417,7 @@ do_time_wait: tcp_v4_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v4_send_reset(sk, skb); + tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS:; @@ -2415,7 +2427,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, }; @@ -3498,7 +3509,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - tcp_twsk_purge(net_exit_list, AF_INET); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c2a9255385..b01eb6d944 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -619,6 +619,7 @@ static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, + [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, /* Following attributes are not received for GET/DEL, * we keep them for reference */ @@ -766,6 +767,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; + int res = 0; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; @@ -778,7 +780,8 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, continue; if (col < s_col) continue; - if (tcp_metrics_dump_info(skb, cb, tm) < 0) { + res = tcp_metrics_dump_info(skb, cb, tm); + if (res < 0) { rcu_read_unlock(); goto done; } @@ -789,7 +792,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, done: cb->args[0] = row; cb->args[1] = col; - return skb->len; + return res; } static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, @@ -986,6 +989,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .maxattr = TCP_METRICS_ATTR_MAX, .policy = tcp_metrics_nl_policy, .netnsok = true, + .parallel_ops = true, .module = THIS_MODULE, .small_ops = tcp_metrics_nl_ops, .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ecc7311dc..0fbebf6266 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -22,6 +22,7 @@ #include <net/tcp.h> #include <net/xfrm.h> #include <net/busy_poll.h> +#include <net/rstreason.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -95,7 +96,7 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th) + const struct tcphdr *th, u32 *tw_isn) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -228,7 +229,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->tcp_tw_isn = isn; + *tw_isn = isn; return TCP_TW_SYN; } @@ -388,7 +389,7 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); -void tcp_twsk_purge(struct list_head *net_exit_list, int family) +void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; @@ -396,14 +397,13 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ - inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { - inet_twsk_purge(&tcp_hashinfo, family); + inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } -EXPORT_SYMBOL_GPL(tcp_twsk_purge); /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. @@ -515,9 +515,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, const struct tcp_sock *oldtp; struct tcp_sock *newtp; u32 seq; -#ifdef CONFIG_TCP_AO - struct tcp_ao_key *ao_key; -#endif if (!newsk) return NULL; @@ -608,10 +605,14 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, #endif #ifdef CONFIG_TCP_AO newtp->ao_info = NULL; - ao_key = treq->af_specific->ao_lookup(sk, req, - tcp_rsk(req)->ao_keyid, -1); - if (ao_key) - newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); + + if (tcp_rsk_used_ao(req)) { + struct tcp_ao_key *ao_key; + + ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); + if (ao_key) + newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); + } #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; @@ -783,8 +784,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, @@ -879,7 +883,7 @@ embryonic_reset: * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ - req->rsk_ops->send_reset(sk, skb); + req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); @@ -907,11 +911,11 @@ EXPORT_SYMBOL(tcp_check_req); * be created. */ -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) +enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { - int ret = 0; + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; int state = child->sk_state; /* record sk_napi_id and sk_rx_queue_mapping of child. */ @@ -919,7 +923,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { - ret = tcp_rcv_state_process(child, skb); + reason = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); @@ -933,6 +937,6 @@ int tcp_child_process(struct sock *parent, struct sock *child, bh_unlock_sock(child); sock_put(child); - return ret; + return reason; } EXPORT_SYMBOL(tcp_child_process); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8311c38267..e4ad3311e1 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -28,6 +28,70 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, } } +static void __tcpv4_gso_segment_csum(struct sk_buff *seg, + __be32 *oldip, __be32 newip, + __be16 *oldport, __be16 newport) +{ + struct tcphdr *th; + struct iphdr *iph; + + if (*oldip == newip && *oldport == newport) + return; + + th = tcp_hdr(seg); + iph = ip_hdr(seg); + + inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true); + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); + *oldport = newport; + + csum_replace4(&iph->check, *oldip, newip); + *oldip = newip; +} + +static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct tcphdr *th; + const struct iphdr *iph; + struct sk_buff *seg; + struct tcphdr *th2; + struct iphdr *iph2; + + seg = segs; + th = tcp_hdr(seg); + iph = ip_hdr(seg); + th2 = tcp_hdr(seg->next); + iph2 = ip_hdr(seg->next); + + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && + iph->daddr == iph2->daddr && iph->saddr == iph2->saddr) + return segs; + + while ((seg = seg->next)) { + th2 = tcp_hdr(seg); + iph2 = ip_hdr(seg); + + __tcpv4_gso_segment_csum(seg, + &iph2->saddr, iph->saddr, + &th2->source, th->source); + __tcpv4_gso_segment_csum(seg, + &iph2->daddr, iph->daddr, + &th2->dest, th->dest); + } + + return segs; +} + +static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + return __tcpv4_gso_segment_list_csum(skb); +} + static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -37,6 +101,9 @@ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) + return __tcp4_gso_segment_list(skb, features); + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); @@ -73,6 +140,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (thlen < sizeof(*th)) goto out; + if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb))) + goto out; + if (!pskb_may_pull(skb, thlen)) goto out; @@ -178,63 +248,76 @@ out: return segs; } -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) { - struct sk_buff *pp = NULL; + struct tcphdr *th2; struct sk_buff *p; + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + return p; + } + + return NULL; +} + +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) +{ + unsigned int thlen, hlen, off; struct tcphdr *th; - struct tcphdr *th2; - unsigned int len; - unsigned int thlen; - __be32 flags; - unsigned int mss = 1; - unsigned int hlen; - unsigned int off; - int flush = 1; - int i; off = skb_gro_offset(skb); hlen = off + sizeof(*th); th = skb_gro_header(skb, hlen, off); if (unlikely(!th)) - goto out; + return NULL; thlen = th->doff * 4; if (thlen < sizeof(*th)) - goto out; + return NULL; hlen = off + thlen; - if (skb_gro_header_hard(skb, hlen)) { + if (!skb_gro_may_pull(skb, hlen)) { th = skb_gro_header_slow(skb, hlen, off); if (unlikely(!th)) - goto out; + return NULL; } skb_gro_pull(skb, thlen); - len = skb_gro_len(skb); - flags = tcp_flag_word(th); - - list_for_each_entry(p, head, list) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; + return th; +} - th2 = tcp_hdr(p); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ + unsigned int thlen = th->doff * 4; + struct sk_buff *pp = NULL; + struct sk_buff *p; + struct tcphdr *th2; + unsigned int len; + __be32 flags; + unsigned int mss = 1; + int flush = 1; + int i; - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } + len = skb_gro_len(skb); + flags = tcp_flag_word(th); - goto found; - } - p = NULL; - goto out_check_final; + p = tcp_gro_lookup(head, th); + if (!p) + goto out_check_final; -found: - /* Include the IP ID check below from the inner most IP hdr */ - flush = NAPI_GRO_CB(p)->flush; - flush |= (__force int)(flags & TCP_FLAG_CWR); + th2 = tcp_hdr(p); + flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); @@ -242,16 +325,7 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); - /* When we receive our second frame we can made a decision on if we - * continue this flow as an atomic flow with a fixed ID or if we use - * an incrementing ID. - */ - if (NAPI_GRO_CB(p)->flush_id != 1 || - NAPI_GRO_CB(p)->count != 1 || - !NAPI_GRO_CB(p)->is_atomic) - flush |= NAPI_GRO_CB(p)->flush_id; - else - NAPI_GRO_CB(p)->is_atomic = false; + flush |= gro_receive_network_flush(th, th2, p); mss = skb_shinfo(p)->gso_size; @@ -265,9 +339,19 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); -#ifdef CONFIG_TLS_DEVICE - flush |= p->decrypted ^ skb->decrypted; -#endif + flush |= skb_cmp_decrypted(p, skb); + + if (unlikely(NAPI_GRO_CB(p)->is_flist)) { + flush |= (__force int)(flags ^ tcp_flag_word(th2)); + flush |= skb->ip_summed != p->ip_summed; + flush |= skb->csum_level != p->csum_level; + flush |= NAPI_GRO_CB(p)->count >= 64; + + if (flush || skb_gro_receive_list(p, skb)) + mss = 1; + + goto out_check_final; + } if (flush || skb_gro_receive(p, skb)) { mss = 1; @@ -290,7 +374,6 @@ out_check_final: if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = p; -out: NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; @@ -299,60 +382,110 @@ out: void tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); + struct skb_shared_info *shinfo; + + if (skb->encapsulation) + skb->inner_transport_header = skb->transport_header; skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; - skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + shinfo = skb_shinfo(skb); + shinfo->gso_segs = NAPI_GRO_CB(skb)->count; if (th->cwr) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - - if (skb->encapsulation) - skb->inner_transport_header = skb->transport_header; + shinfo->gso_type |= SKB_GSO_TCP_ECN; } EXPORT_SYMBOL(tcp_gro_complete); +static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ + const struct iphdr *iph; + struct sk_buff *p; + struct sock *sk; + struct net *net; + int iif, sdif; + + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) + return; + + p = tcp_gro_lookup(head, th); + if (p) { + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; + return; + } + + inet_get_iif_sdif(skb, &iif, &sdif); + iph = skb_gro_network_header(skb); + net = dev_net(skb->dev); + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, + iph->saddr, th->source, + iph->daddr, ntohs(th->dest), + iif, sdif); + NAPI_GRO_CB(skb)->is_flist = !sk; + if (sk) + sock_put(sk); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - inet_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + inet_gro_compute_pseudo)) + goto flush; + + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; + + tcp4_check_fraglist_gro(head, skb, th); - return tcp_gro_receive(head, skb); + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) { - const struct iphdr *iph = ip_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + __skb_incr_checksum_unnecessary(skb); + + return 0; + } + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); - skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; - if (NAPI_GRO_CB(skb)->is_atomic) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | + (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; } -static const struct net_offload tcpv4_offload = { - .callbacks = { - .gso_segment = tcp4_gso_segment, - .gro_receive = tcp4_gro_receive, - .gro_complete = tcp4_gro_complete, - }, -}; - int __init tcpv4_offload_init(void) { - return inet_add_offload(&tcpv4_offload, IPPROTO_TCP); + net_hotdata.tcpv4_offload = (struct net_offload) { + .callbacks = { + .gso_segment = tcp4_gso_segment, + .gro_receive = tcp4_gro_receive, + .gro_complete = tcp4_gro_complete, + }, + }; + return inet_add_offload(&net_hotdata.tcpv4_offload, IPPROTO_TCP); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3167ad965..95618d0e78 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,11 +39,13 @@ #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> +#include <linux/skbuff_ref.h> #include <trace/events/tcp.h> @@ -203,16 +205,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, - __u32 *rcv_wnd, __u32 *window_clamp, + __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); + u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ - if (*window_clamp == 0) - (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); - space = min(*window_clamp, space); + if (window_clamp == 0) + window_clamp = (U16_MAX << TCP_MAX_WSCALE); + space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) @@ -229,7 +232,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else - (*rcv_wnd) = min_t(u32, space, U16_MAX); + (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); @@ -239,12 +242,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); - space = min_t(u32, space, *window_clamp); + space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); + WRITE_ONCE(*__window_clamp, + min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -1499,18 +1503,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { + int tso_segs; + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ - tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->tcp_gso_size = 0; - } else { - tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tcp_skb_pcount_set(skb, 1); + return 1; } + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, tso_segs); + return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various @@ -2070,16 +2078,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; - /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tcp_skb_pcount(skb) == 1) - return 1; - in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) @@ -2100,10 +2102,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(skb, mss_now); - tso_segs = tcp_skb_pcount(skb); - } + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) + return tcp_set_skb_tso_segs(skb, mss_now); + return tso_segs; } @@ -2403,6 +2404,21 @@ commit: return 0; } +/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if + * all its payload was moved to another one (dst). + * Make sure to transfer tcp_flags, eor, and tstamp. + */ +static void tcp_eat_one_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) +{ + TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; + TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; + tcp_skb_collapse_tstamp(dst, src); + tcp_unlink_write_queue(src, sk); + tcp_wmem_free_skb(sk, src); +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -2508,16 +2524,7 @@ static int tcp_mtu_probe(struct sock *sk) copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { - /* We've eaten all the data from this skb. - * Throw it away. */ - TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; - /* If this is the last SKB we copy and eor is set - * we need to propagate it to the new skb. - */ - TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; - tcp_skb_collapse_tstamp(nskb, skb); - tcp_unlink_write_queue(skb, sk); - tcp_wmem_free_skb(sk, skb); + tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); @@ -2683,6 +2690,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +/* First skb in the write queue is smaller than ideal packet size. + * Check if we can move payload from the second skb in the queue. + */ +static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) +{ + struct sk_buff *next_skb = skb->next; + unsigned int nlen; + + if (tcp_skb_is_last(sk, skb)) + return; + + if (!tcp_skb_can_collapse(skb, next_skb)) + return; + + nlen = min_t(u32, amount, next_skb->len); + if (!nlen || !skb_shift(skb, next_skb, nlen)) + return; + + TCP_SKB_CB(skb)->end_seq += nlen; + TCP_SKB_CB(next_skb)->seq += nlen; + + if (!next_skb->len) { + /* In case FIN is set, we need to update end_seq */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + tcp_eat_one_skb(sk, skb, next_skb); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2703,10 +2739,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; - int cwnd_quota; + u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; sent_pkts = 0; @@ -2724,6 +2759,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; + int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ @@ -2737,10 +2773,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_pacing_check(sk)) break; - tso_segs = tcp_init_tso_segs(skb, mss_now); - BUG_ON(!tso_segs); - - cwnd_quota = tcp_cwnd_test(tp, skb); + cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ @@ -2748,6 +2781,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else break; } + cwnd_quota = min(cwnd_quota, max_segs); + missing_bytes = cwnd_quota * mss_now - skb->len; + if (missing_bytes > 0) + tcp_grow_skb(sk, skb, missing_bytes); + + tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; @@ -2769,9 +2808,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - max_segs), + cwnd_quota, nonagle); if (skb->len > limit && @@ -3387,11 +3424,6 @@ start: err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - /* To avoid taking spuriously low RTT samples based on a timestamp - * for a transmit that never happened, always mark EVER_RETRANS - */ - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); @@ -3401,6 +3433,12 @@ start: } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } + + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + return err; } @@ -3563,7 +3601,9 @@ void tcp_send_fin(struct sock *sk) return; } } else { - skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + skb = alloc_skb_fclone(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | + __GFP_NOWARN)); if (unlikely(!skb)) return; @@ -3583,7 +3623,8 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ -void tcp_send_active_reset(struct sock *sk, gfp_t priority) +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason) { struct sk_buff *skb; @@ -3608,7 +3649,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ - trace_tcp_send_reset(sk, NULL); + trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); } /* Send a crossed SYN-ACK during socket establishment. @@ -3855,7 +3896,7 @@ static void tcp_connect_init(struct sock *sk) tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) - tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3863,7 +3904,7 @@ static void tcp_connect_init(struct sock *sk) /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) - tp->window_clamp = tcp_full_space(sk); + WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d1ad20ce1c..4d40615dc8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,10 +22,11 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> +#include <net/rstreason.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); u32 elapsed, user_timeout; s32 remaining; @@ -47,7 +48,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 remaining, user_timeout; s32 elapsed; @@ -73,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) static void tcp_write_err(struct sock *sk) { - WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); - sk_error_report(sk); - - tcp_write_queue_purge(sk); - tcp_done(sk); + tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } @@ -127,7 +124,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; @@ -481,11 +479,26 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct sk_buff *skb, u32 rtx_delta) { + const struct inet_connection_sock *icsk = inet_csk(sk); + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - const int timeout = TCP_RTO_MAX * 2; - u32 rcv_delta; + int timeout = TCP_RTO_MAX * 2; + s32 rcv_delta; - rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; + if (user_timeout) { + /* If user application specified a TCP_USER_TIMEOUT, + * it does not want win 0 packets to 'reset the timer' + * while retransmits are not making progress. + */ + if (rtx_delta > user_timeout) + return true; + timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout)); + } + /* Note: timer interrupt might have been delayed by at least one jiffy, + * and tp->rcv_tstamp might very well have been written recently. + * rcv_delta can thus be negative. + */ + rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -530,8 +543,6 @@ void tcp_retransmit_timer(struct sock *sk) if (WARN_ON_ONCE(!skb)) return; - tp->tlp_high_seq = 0; - if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. Our retransmits @@ -768,7 +779,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } } - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); goto death; } @@ -795,7 +806,8 @@ static void tcp_keepalive_timer (struct timer_list *t) icsk->icsk_probes_out > 0) || (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_write_err(sk); goto out; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 40282a3418..578668878a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -326,6 +326,8 @@ found: goto fail_unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -342,7 +344,7 @@ found: hslot2->count++; spin_unlock(&hslot2->lock); } - sock_set_flag(sk, SOCK_RCU_FREE); + error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -411,8 +413,6 @@ INDIRECT_CALLABLE_SCOPE u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { - static u32 udp_ehash_secret __read_mostly; - net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, @@ -429,15 +429,21 @@ static struct sock *udp4_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -458,9 +464,14 @@ static struct sock *udp4_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(result, net, saddr, sport, - daddr, hnum, dif, sdif); - + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; @@ -534,7 +545,8 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { - const struct iphdr *iph = ip_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; @@ -1208,7 +1220,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (connected) - rt = (struct rtable *)sk_dst_check(sk, 0); + rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); @@ -1502,13 +1514,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - int size; + bool becomes_readable; + int size, rcvbuf; - /* try to avoid the costly atomic add/sub pair when the receive - * queue is full; always allow at least a packet + /* Immediately drop when the receive queue is full. + * Always allow at least one packet. */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem > sk->sk_rcvbuf) + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + if (rmem > rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() @@ -1517,7 +1531,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ - if (rmem > (sk->sk_rcvbuf >> 1)) { + if (rmem > (rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); @@ -1525,12 +1539,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) size = skb->truesize; udp_set_dev_scratch(skb); - /* we drop only if the receive buf is full and the receive - * queue contains some other skb - */ - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) - goto uncharge_drop; + atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); @@ -1546,12 +1555,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); + becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); - + if (!sock_flag(sk, SOCK_DEAD)) { + if (becomes_readable || + sk->sk_data_ready != sock_def_readable || + READ_ONCE(sk->sk_peek_off) >= 0) + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + else + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); + } busylock_release(busy); return 0; @@ -1597,7 +1613,8 @@ int udp_init_sock(struct sock *sk) void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { - sk_peek_offset_bwd(sk, len); + if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) + sk_peek_offset_bwd(sk, len); if (!skb_unref(skb)) return; @@ -2058,8 +2075,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } @@ -2577,11 +2594,12 @@ int udp_v4_early_demux(struct sk_buff *skb) uh->source, iph->saddr, dif, sdif); } - if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) + if (!sk) return 0; skb->sk = sk; - skb->destructor = sock_efree; + DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); + skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) @@ -2696,8 +2714,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); - fallthrough; - case UDP_ENCAP_ESPINUDP_NON_IKE: #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index dc41a22ee8..38cb3a28e4 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -237,6 +237,7 @@ static int udplite_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler udp_diag_handler = { + .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, @@ -260,6 +261,7 @@ static int udplite_diag_dump_one(struct netlink_callback *cb, } static const struct inet_diag_handler udplite_diag_handler = { + .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index c3d67423ae..ee9af92155 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -278,6 +278,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); + if (unlikely(skb_checksum_start(gso_skb) != + skb_transport_header(gso_skb))) + return ERR_PTR(-EINVAL); + if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), @@ -433,33 +437,6 @@ out: return segs; } -static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) -{ - if (unlikely(p->len + skb->len >= 65536)) - return -E2BIG; - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - - skb_pull(skb, skb_gro_offset(skb)); - - NAPI_GRO_CB(p)->last = skb; - NAPI_GRO_CB(p)->count++; - p->data_len += skb->len; - - /* sk ownership - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - skb->sk = NULL; - p->truesize += skb->truesize; - p->len += skb->len; - - NAPI_GRO_CB(skb)->same_flow = 1; - - return 0; -} - #define UDP_GRO_CNT_MAX 64 static struct sk_buff *udp_gro_receive_segment(struct list_head *head, @@ -471,6 +448,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *p; unsigned int ulen; int ret = 0; + int flush; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -504,13 +482,15 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return p; } + flush = gro_receive_network_flush(uh, uh2, p); + /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (ulen > ntohs(uh2->len)) { + if (ulen > ntohs(uh2->len) || flush) { pp = p; } else { if (NAPI_GRO_CB(skb)->is_flist) { @@ -718,7 +698,8 @@ EXPORT_SYMBOL(udp_gro_complete); INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) { - const struct iphdr *iph = ip_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); /* do fraglist only if there is no outer UDP encap (or we already processed it) */ @@ -740,15 +721,14 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } -static const struct net_offload udpv4_offload = { - .callbacks = { - .gso_segment = udp4_ufo_fragment, - .gro_receive = udp4_gro_receive, - .gro_complete = udp4_gro_complete, - }, -}; - int __init udpv4_offload_init(void) { - return inet_add_offload(&udpv4_offload, IPPROTO_UDP); + net_hotdata.udpv4_offload = (struct net_offload) { + .callbacks = { + .gso_segment = udp4_ufo_fragment, + .gro_receive = udp4_gro_receive, + .gro_complete = udp4_gro_complete, + }, + }; + return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 860aff5f85..e4e0fa869f 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -183,7 +183,8 @@ void udp_tunnel_sock_release(struct socket *sock) EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, int md_size) + const unsigned long *flags, + __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; @@ -199,7 +200,7 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) - info->key.tun_flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index c54676998e..a620618cc5 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -58,12 +58,16 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) return -iph->protocol; #endif - __skb_push(skb, skb->data - skb_network_header(skb)); + __skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); ip_send_check(iph); if (xo && (xo->flags & XFRM_GRO)) { - skb_mac_header_rebuild(skb); + /* The full l2 header needs to be preserved so that re-injecting the packet at l2 + * works correctly in the presence of vlan tags. + */ + skb_mac_header_rebuild_full(skb, xo->orig_mac_len); + skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } @@ -113,19 +117,6 @@ static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull /* Must be an IKE packet.. pass it through */ return 1; break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - /* Check if this is a keepalive packet. If so, eat it. */ - if (len == 1 && udpdata[0] == 0xff) { - return -EINVAL; - } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && - udpdata32[0] == 0 && udpdata32[1] == 0) { - - /* ESP Packet with Non-IKE marker */ - len = sizeof(struct udphdr) + 2 * sizeof(u32); - } else - /* Must be an IKE packet.. pass it through */ - return 1; - break; } /* At this point we are sure that this is an ESPinUDP packet, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c33bca2c38..0294fef577 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -69,7 +69,7 @@ static int xfrm4_get_saddr(struct net *net, int oif, static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rtable *rt = (struct rtable *)xdst->route; + struct rtable *rt = dst_rtable(xdst->route); const struct flowi4 *fl4 = &fl->u.ip4; xdst->u.rt.rt_iif = fl4->flowi4_iif; @@ -152,7 +152,6 @@ static struct ctl_table xfrm4_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __net_init int xfrm4_net_sysctl_init(struct net *net) @@ -186,7 +185,7 @@ err_alloc: static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv4.xfrm4_hdr) return; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 37d48aa073..4f2c5cc310 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -63,6 +63,7 @@ #include <linux/string.h> #include <linux/hash.h> +#include <net/ip_tunnels.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> @@ -195,6 +196,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_min_advance = REGEN_MIN_ADVANCE, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, @@ -257,6 +259,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_min_advance = REGEN_MIN_ADVANCE, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, @@ -549,7 +552,8 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, goto out; if ((all || type == NETCONFA_FORWARDING) && - nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) + nla_put_s32(skb, NETCONFA_FORWARDING, + READ_ONCE(devconf->forwarding)) < 0) goto nla_put_failure; #ifdef CONFIG_IPV6_MROUTE if ((all || type == NETCONFA_MC_FORWARDING) && @@ -558,12 +562,13 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, goto nla_put_failure; #endif if ((all || type == NETCONFA_PROXY_NEIGH) && - nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, + READ_ONCE(devconf->proxy_ndp)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, - devconf->ignore_routes_with_linkdown) < 0) + READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0) goto nla_put_failure; out: @@ -713,7 +718,7 @@ errout: static u32 inet6_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv6.dev_addr_genid) + - net->dev_base_seq; + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. @@ -723,17 +728,18 @@ static u32 inet6_base_seq(const struct net *net) return res; } - static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - int h, s_h; - int idx, s_idx; + struct { + unsigned long ifindex; + unsigned int all_default; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct inet6_dev *idev; - struct hlist_head *head; + int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; @@ -750,64 +756,46 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, } } - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - rcu_read_lock(); - cb->seq = inet6_base_seq(net); - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - idev = __in6_dev_get(dev); - if (!idev) - goto cont; - - if (inet6_netconf_fill_devconf(skb, dev->ifindex, - &idev->cnf, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, - NLM_F_MULTI, - NETCONFA_ALL) < 0) { - rcu_read_unlock(); - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - rcu_read_unlock(); + rcu_read_lock(); + for_each_netdev_dump(net, dev, ctx->ifindex) { + idev = __in6_dev_get(dev); + if (!idev) + continue; + err = inet6_netconf_fill_devconf(skb, dev->ifindex, + &idev->cnf, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) + goto done; } - if (h == NETDEV_HASHENTRIES) { - if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, - net->ipv6.devconf_all, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, NLM_F_MULTI, - NETCONFA_ALL) < 0) + if (ctx->all_default == 0) { + err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) goto done; - else - h++; - } - if (h == NETDEV_HASHENTRIES + 1) { - if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, - net->ipv6.devconf_dflt, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, NLM_F_MULTI, - NETCONFA_ALL) < 0) + ctx->all_default++; + } + if (ctx->all_default == 1) { + err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) goto done; - else - h++; + ctx->all_default++; } done: - cb->args[0] = h; - cb->args[1] = idx; - - return skb->len; + rcu_read_unlock(); + return err; } #ifdef CONFIG_SYSCTL @@ -867,7 +855,8 @@ static void addrconf_forward_change(struct net *net, __s32 newf) idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); - idev->cnf.forwarding = newf; + + WRITE_ONCE(idev->cnf.forwarding, newf); if (changed) dev_forward_change(idev); } @@ -884,7 +873,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) net = (struct net *)table->extra2; old = *p; - *p = newf; + WRITE_ONCE(*p, newf); if (p == &net->ipv6.devconf_dflt->forwarding) { if ((!newf) ^ (!old)) @@ -899,7 +888,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) if (p == &net->ipv6.devconf_all->forwarding) { int old_dflt = net->ipv6.devconf_dflt->forwarding; - net->ipv6.devconf_dflt->forwarding = newf; + WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf); if ((!newf) ^ (!old_dflt)) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, @@ -931,7 +920,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) if (idev) { int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); - idev->cnf.ignore_routes_with_linkdown = newf; + WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf); if (changed) inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, @@ -952,7 +941,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) net = (struct net *)table->extra2; old = *p; - *p = newf; + WRITE_ONCE(*p, newf); if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { if ((!newf) ^ (!old)) @@ -966,7 +955,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) } if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { - net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf); addrconf_linkdown_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, @@ -1270,6 +1259,7 @@ static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt, bool del_peer) { + struct fib6_table *table; struct fib6_info *f6i; f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr, @@ -1279,8 +1269,15 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, if (del_rt) ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { - if (!(f6i->fib6_flags & RTF_EXPIRES)) + if (!(f6i->fib6_flags & RTF_EXPIRES)) { + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + + spin_unlock_bh(&table->tb6_lock); + } fib6_info_release(f6i); } } @@ -1346,12 +1343,21 @@ out: in6_ifa_put(ifp); } +static unsigned long ipv6_get_regen_advance(const struct inet6_dev *idev) +{ + return READ_ONCE(idev->cnf.regen_min_advance) + + READ_ONCE(idev->cnf.regen_max_retry) * + READ_ONCE(idev->cnf.dad_transmits) * + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; +} + static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) { struct inet6_dev *idev = ifp->idev; unsigned long tmp_tstamp, age; unsigned long regen_advance; unsigned long now = jiffies; + u32 if_public_preferred_lft; s32 cnf_temp_preferred_lft; struct inet6_ifaddr *ift; struct ifa6_config cfg; @@ -1363,7 +1369,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) retry: in6_dev_hold(idev); - if (idev->cnf.use_tempaddr <= 0) { + if (READ_ONCE(idev->cnf.use_tempaddr) <= 0) { write_unlock_bh(&idev->lock); pr_info("%s: use_tempaddr is disabled\n", __func__); in6_dev_put(idev); @@ -1371,8 +1377,8 @@ retry: goto out; } spin_lock_bh(&ifp->lock); - if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { - idev->cnf.use_tempaddr = -1; /*XXX*/ + if (ifp->regen_count++ >= READ_ONCE(idev->cnf.regen_max_retry)) { + WRITE_ONCE(idev->cnf.use_tempaddr, -1); /*XXX*/ spin_unlock_bh(&ifp->lock); write_unlock_bh(&idev->lock); pr_warn("%s: regeneration time exceeded - disabled temporary address support\n", @@ -1387,16 +1393,14 @@ retry: age = (now - ifp->tstamp) / HZ; - regen_advance = idev->cnf.regen_max_retry * - idev->cnf.dad_transmits * - max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + regen_advance = ipv6_get_regen_advance(idev); /* recalculate max_desync_factor each time and update * idev->desync_factor if it's larger */ cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft); max_desync_factor = min_t(long, - idev->cnf.max_desync_factor, + READ_ONCE(idev->cnf.max_desync_factor), cnf_temp_preferred_lft - regen_advance); if (unlikely(idev->desync_factor > max_desync_factor)) { @@ -1409,11 +1413,13 @@ retry: } } + if_public_preferred_lft = ifp->prefered_lft; + memset(&cfg, 0, sizeof(cfg)); cfg.valid_lft = min_t(__u32, ifp->valid_lft, - idev->cnf.temp_valid_lft + age); + READ_ONCE(idev->cnf.temp_valid_lft) + age); cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor; - cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft); + cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft); cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft); cfg.plen = ifp->prefix_len; @@ -1422,19 +1428,41 @@ retry: write_unlock_bh(&idev->lock); - /* A temporary address is created only if this calculated Preferred - * Lifetime is greater than REGEN_ADVANCE time units. In particular, - * an implementation must not create a temporary address with a zero - * Preferred Lifetime. + /* From RFC 4941: + * + * A temporary address is created only if this calculated Preferred + * Lifetime is greater than REGEN_ADVANCE time units. In + * particular, an implementation must not create a temporary address + * with a zero Preferred Lifetime. + * + * ... + * + * When creating a temporary address, the lifetime values MUST be + * derived from the corresponding prefix as follows: + * + * ... + * + * * Its Preferred Lifetime is the lower of the Preferred Lifetime + * of the public address or TEMP_PREFERRED_LIFETIME - + * DESYNC_FACTOR. + * + * To comply with the RFC's requirements, clamp the preferred lifetime + * to a minimum of regen_advance, unless that would exceed valid_lft or + * ifp->prefered_lft. + * * Use age calculation as in addrconf_verify to avoid unnecessary * temporary addresses being generated. */ age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (cfg.preferred_lft <= regen_advance + age) { - in6_ifa_put(ifp); - in6_dev_put(idev); - ret = -1; - goto out; + cfg.preferred_lft = regen_advance + age + 1; + if (cfg.preferred_lft > cfg.valid_lft || + cfg.preferred_lft > if_public_preferred_lft) { + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } } cfg.ifa_flags = IFA_F_TEMPORARY; @@ -1513,15 +1541,17 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static bool ipv6_use_optimistic_addr(struct net *net, - struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(const struct net *net, + const struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (!idev) return false; - if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) && + !READ_ONCE(idev->cnf.optimistic_dad)) return false; - if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + if (!READ_ONCE(net->ipv6.devconf_all->use_optimistic) && + !READ_ONCE(idev->cnf.use_optimistic)) return false; return true; @@ -1530,13 +1560,14 @@ static bool ipv6_use_optimistic_addr(struct net *net, #endif } -static bool ipv6_allow_optimistic_dad(struct net *net, - struct inet6_dev *idev) +static bool ipv6_allow_optimistic_dad(const struct net *net, + const struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (!idev) return false; - if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) && + !READ_ONCE(idev->cnf.optimistic_dad)) return false; return true; @@ -1642,7 +1673,7 @@ static int ipv6_get_saddr_eval(struct net *net, */ int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? !!(dst->prefs & IPV6_PREFER_SRC_TMP) : - score->ifa->idev->cnf.use_tempaddr >= 2; + READ_ONCE(score->ifa->idev->cnf.use_tempaddr) >= 2; ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; break; } @@ -1818,7 +1849,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, idev = __in6_dev_get(dst_dev); if ((dst_type & IPV6_ADDR_MULTICAST) || dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || - (idev && idev->cnf.use_oif_addrs_only)) { + (idev && READ_ONCE(idev->cnf.use_oif_addrs_only))) { use_oif_addr = true; } } @@ -1842,7 +1873,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, master, &dst, scores, hiscore_idx); - if (scores[hiscore_idx].ifa) + if (scores[hiscore_idx].ifa && + scores[hiscore_idx].scopedist >= 0) goto out; } @@ -2126,6 +2158,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(idev->dev); + int max_addresses; if (addrconf_dad_end(ifp)) { in6_ifa_put(ifp); @@ -2163,9 +2196,9 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) spin_unlock_bh(&ifp->lock); - if (idev->cnf.max_addresses && - ipv6_count_addresses(idev) >= - idev->cnf.max_addresses) + max_addresses = READ_ONCE(idev->cnf.max_addresses); + if (max_addresses && + ipv6_count_addresses(idev) >= max_addresses) goto lock_errdad; net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n", @@ -2562,11 +2595,11 @@ static void manage_tempaddrs(struct inet6_dev *idev, * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively. */ age = (now - ift->cstamp) / HZ; - max_valid = idev->cnf.temp_valid_lft - age; + max_valid = READ_ONCE(idev->cnf.temp_valid_lft) - age; if (max_valid < 0) max_valid = 0; - max_prefered = idev->cnf.temp_prefered_lft - + max_prefered = READ_ONCE(idev->cnf.temp_prefered_lft) - idev->desync_factor - age; if (max_prefered < 0) max_prefered = 0; @@ -2599,7 +2632,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft)) create = true; - if (create && idev->cnf.use_tempaddr > 0) { + if (create && READ_ONCE(idev->cnf.use_tempaddr) > 0) { /* When a new public address is created as described * in [ADDRCONF], also create a new temporary address. */ @@ -2627,7 +2660,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int create = 0, update_lft = 0; if (!ifp && valid_lft) { - int max_addresses = in6_dev->cnf.max_addresses; + int max_addresses = READ_ONCE(in6_dev->cnf.max_addresses); struct ifa6_config cfg = { .pfx = addr, .plen = pinfo->prefix_len, @@ -2639,8 +2672,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, }; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if ((net->ipv6.devconf_all->optimistic_dad || - in6_dev->cnf.optimistic_dad) && + if ((READ_ONCE(net->ipv6.devconf_all->optimistic_dad) || + READ_ONCE(in6_dev->cnf.optimistic_dad)) && !net->ipv6.devconf_all->forwarding && sllao) cfg.ifa_flags |= IFA_F_OPTIMISTIC; #endif @@ -2689,7 +2722,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, */ update_lft = !create && stored_lft; - if (update_lft && !in6_dev->cnf.ra_honor_pio_life) { + if (update_lft && !READ_ONCE(in6_dev->cnf.ra_honor_pio_life)) { const u32 minimum_lft = min_t(u32, stored_lft, MIN_VALID_LIFETIME); valid_lft = max(valid_lft, minimum_lft); @@ -2698,7 +2731,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, if (update_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; - ifp->tstamp = now; + WRITE_ONCE(ifp->tstamp, now); flags = ifp->flags; ifp->flags &= ~IFA_F_DEPRECATED; spin_unlock_bh(&ifp->lock); @@ -2722,6 +2755,7 @@ EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; + struct fib6_table *table; __u32 valid_lft; __u32 prefered_lft; int addr_type, err; @@ -2798,11 +2832,20 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (valid_lft == 0) { ip6_del_rt(net, rt, false); rt = NULL; - } else if (addrconf_finite_timeout(rt_expires)) { - /* not infinity */ - fib6_set_expires(rt, jiffies + rt_expires); } else { - fib6_clean_expires(rt); + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + fib6_set_expires(rt, jiffies + rt_expires); + fib6_add_gc_list(rt); + } else { + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); } } else if (valid_lft) { clock_t expires = 0; @@ -2877,7 +2920,7 @@ put: static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) @@ -3263,8 +3306,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, struct inet6_ifaddr *ifp; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || - idev->cnf.optimistic_dad) && + if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad) || + READ_ONCE(idev->cnf.optimistic_dad)) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) cfg.ifa_flags |= IFA_F_OPTIMISTIC; #endif @@ -3443,7 +3486,8 @@ static void addrconf_dev_config(struct net_device *dev) /* this device type has no EUI support */ if (dev->type == ARPHRD_NONE && idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) - idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + WRITE_ONCE(idev->cnf.addr_gen_mode, + IN6_ADDR_GEN_MODE_RANDOM); addrconf_addr_gen(idev, false); } @@ -3621,7 +3665,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev) { rt6_mtu_change(dev, dev->mtu); - idev->cnf.mtu6 = dev->mtu; + WRITE_ONCE(idev->cnf.mtu6, dev->mtu); break; } @@ -3713,9 +3757,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) { rt6_mtu_change(dev, dev->mtu); - idev->cnf.mtu6 = dev->mtu; + WRITE_ONCE(idev->cnf.mtu6, dev->mtu); } - idev->tstamp = jiffies; + WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); /* @@ -3835,10 +3879,10 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) */ if (!unregister && !idev->cnf.disable_ipv6) { /* aggregate the system setting and interface setting */ - int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + int _keep_addr = READ_ONCE(net->ipv6.devconf_all->keep_addr_on_down); if (!_keep_addr) - _keep_addr = idev->cnf.keep_addr_on_down; + _keep_addr = READ_ONCE(idev->cnf.keep_addr_on_down); keep_addr = (_keep_addr > 0); } @@ -3957,7 +4001,7 @@ restart: ipv6_mc_down(idev); } - idev->tstamp = jiffies; + WRITE_ONCE(idev->tstamp, jiffies); idev->ra_mtu = 0; /* Last: Shot the device (if unregistered) */ @@ -3975,6 +4019,7 @@ static void addrconf_rs_timer(struct timer_list *t) struct inet6_dev *idev = from_timer(idev, t, rs_timer); struct net_device *dev = idev->dev; struct in6_addr lladdr; + int rtr_solicits; write_lock(&idev->lock); if (idev->dead || !(idev->if_flags & IF_READY)) @@ -3987,7 +4032,9 @@ static void addrconf_rs_timer(struct timer_list *t) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { + rtr_solicits = READ_ONCE(idev->cnf.rtr_solicits); + + if (idev->rs_probes++ < rtr_solicits || rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3997,11 +4044,12 @@ static void addrconf_rs_timer(struct timer_list *t) write_lock(&idev->lock); idev->rs_interval = rfc3315_s14_backoff_update( - idev->rs_interval, idev->cnf.rtr_solicit_max_interval); + idev->rs_interval, + READ_ONCE(idev->cnf.rtr_solicit_max_interval)); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == - idev->cnf.rtr_solicits) ? - idev->cnf.rtr_solicit_delay : + READ_ONCE(idev->cnf.rtr_solicits)) ? + READ_ONCE(idev->cnf.rtr_solicit_delay) : idev->rs_interval); } else { /* @@ -4022,24 +4070,25 @@ put: */ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) { - unsigned long rand_num; struct inet6_dev *idev = ifp->idev; + unsigned long rand_num; u64 nonce; if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = get_random_u32_below(idev->cnf.rtr_solicit_delay ? : 1); + rand_num = get_random_u32_below( + READ_ONCE(idev->cnf.rtr_solicit_delay) ? : 1); nonce = 0; - if (idev->cnf.enhanced_dad || - dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) { + if (READ_ONCE(idev->cnf.enhanced_dad) || + READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad)) { do get_random_bytes(&nonce, 6); while (nonce == 0); } ifp->dad_nonce = nonce; - ifp->dad_probes = idev->cnf.dad_transmits; + ifp->dad_probes = READ_ONCE(idev->cnf.dad_transmits); addrconf_mod_dad_work(ifp, rand_num); } @@ -4059,8 +4108,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) net = dev_net(dev); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - (net->ipv6.devconf_all->accept_dad < 1 && - idev->cnf.accept_dad < 1) || + (READ_ONCE(net->ipv6.devconf_all->accept_dad) < 1 && + READ_ONCE(idev->cnf.accept_dad) < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bool send_na = false; @@ -4152,8 +4201,8 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || - idev->cnf.accept_dad > 1) && + if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->accept_dad) > 1 || + READ_ONCE(idev->cnf.accept_dad) > 1) && !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; @@ -4164,7 +4213,7 @@ static void addrconf_dad_work(struct work_struct *w) if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && ipv6_addr_equal(&ifp->addr, &addr)) { /* DAD failed for link-local based on MAC */ - idev->cnf.disable_ipv6 = 1; + WRITE_ONCE(idev->cnf.disable_ipv6, 1); pr_info("%s: IPv6 being disabled!\n", ifp->idev->dev->name); @@ -4278,7 +4327,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits != 0 && + READ_ONCE(ifp->idev->cnf.rtr_solicits) != 0 && (dev->flags & IFF_LOOPBACK) == 0 && (dev->type != ARPHRD_TUNNEL) && !netif_is_team_port(dev); @@ -4292,8 +4341,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, /* send unsolicited NA if enabled */ if (send_na && - (ifp->idev->cnf.ndisc_notify || - dev_net(dev)->ipv6.devconf_all->ndisc_notify)) { + (READ_ONCE(ifp->idev->cnf.ndisc_notify) || + READ_ONCE(dev_net(dev)->ipv6.devconf_all->ndisc_notify))) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr, /*router=*/ !!ifp->idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, @@ -4313,7 +4362,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); ifp->idev->rs_interval = rfc3315_s14_backoff_init( - ifp->idev->cnf.rtr_solicit_interval); + READ_ONCE(ifp->idev->cnf.rtr_solicit_interval)); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); @@ -4593,9 +4642,7 @@ restart: !ifp->regen_count && ifp->ifpub) { /* This is a non-regenerated temporary addr. */ - unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * - ifp->idev->cnf.dad_transmits * - max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev); if (age + regen_advance >= ifp->prefered_lft) { struct inet6_ifaddr *ifpub = ifp->ifpub; @@ -4757,6 +4804,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, u32 flags, bool modify_peer) { + struct fib6_table *table; struct fib6_info *f6i; u32 prio; @@ -4777,10 +4825,18 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } else { - if (!expires) + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!(flags & RTF_EXPIRES)) { fib6_clean_expires(f6i); - else + fib6_remove_gc_list(f6i); + } else { fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(f6i); } @@ -4843,13 +4899,13 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE); ifp->flags |= cfg->ifa_flags; - ifp->tstamp = jiffies; - ifp->valid_lft = cfg->valid_lft; - ifp->prefered_lft = cfg->preferred_lft; - ifp->ifa_proto = cfg->ifa_proto; + WRITE_ONCE(ifp->tstamp, jiffies); + WRITE_ONCE(ifp->valid_lft, cfg->valid_lft); + WRITE_ONCE(ifp->prefered_lft, cfg->preferred_lft); + WRITE_ONCE(ifp->ifa_proto, cfg->ifa_proto); if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) - ifp->rt_priority = cfg->rt_priority; + WRITE_ONCE(ifp->rt_priority, cfg->rt_priority); if (new_peer) ifp->peer_addr = *cfg->peer_pfx; @@ -5070,17 +5126,21 @@ struct inet6_fill_args { enum addr_type_t type; }; -static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, +static int inet6_fill_ifaddr(struct sk_buff *skb, + const struct inet6_ifaddr *ifa, struct inet6_fill_args *args) { - struct nlmsghdr *nlh; + struct nlmsghdr *nlh; u32 preferred, valid; + u32 flags, priority; + u8 proto; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; + flags = READ_ONCE(ifa->flags); put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), ifa->idev->dev->ifindex); @@ -5088,13 +5148,14 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto error; - spin_lock_bh(&ifa->lock); - if (!((ifa->flags&IFA_F_PERMANENT) && - (ifa->prefered_lft == INFINITY_LIFE_TIME))) { - preferred = ifa->prefered_lft; - valid = ifa->valid_lft; + preferred = READ_ONCE(ifa->prefered_lft); + valid = READ_ONCE(ifa->valid_lft); + + if (!((flags & IFA_F_PERMANENT) && + (preferred == INFINITY_LIFE_TIME))) { if (preferred != INFINITY_LIFE_TIME) { - long tval = (jiffies - ifa->tstamp)/HZ; + long tval = (jiffies - READ_ONCE(ifa->tstamp)) / HZ; + if (preferred > tval) preferred -= tval; else @@ -5110,28 +5171,29 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } - spin_unlock_bh(&ifa->lock); if (!ipv6_addr_any(&ifa->peer_addr)) { if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 || nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0) goto error; - } else + } else { if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0) goto error; + } - if (ifa->rt_priority && - nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority)) + priority = READ_ONCE(ifa->rt_priority); + if (priority && nla_put_u32(skb, IFA_RT_PRIORITY, priority)) goto error; - if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) + if (put_cacheinfo(skb, ifa->cstamp, READ_ONCE(ifa->tstamp), + preferred, valid) < 0) goto error; - if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) + if (nla_put_u32(skb, IFA_FLAGS, flags) < 0) goto error; - if (ifa->ifa_proto && - nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) + proto = READ_ONCE(ifa->ifa_proto); + if (proto && nla_put_u8(skb, IFA_PROTO, proto)) goto error; nlmsg_end(skb, nlh); @@ -5142,12 +5204,13 @@ error: return -EMSGSIZE; } -static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, +static int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, struct inet6_fill_args *args) { - struct nlmsghdr *nlh; - u8 scope = RT_SCOPE_UNIVERSE; int ifindex = ifmca->idev->dev->ifindex; + u8 scope = RT_SCOPE_UNIVERSE; + struct nlmsghdr *nlh; if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; @@ -5165,7 +5228,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 || - put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, + put_cacheinfo(skb, ifmca->mca_cstamp, READ_ONCE(ifmca->mca_tstamp), INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; @@ -5175,13 +5238,14 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, return 0; } -static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, +static int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? dev->ifindex : 1; - struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; + struct nlmsghdr *nlh; if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; @@ -5199,7 +5263,7 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 || - put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, + put_cacheinfo(skb, ifaca->aca_cstamp, READ_ONCE(ifaca->aca_tstamp), INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; @@ -5210,24 +5274,23 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, } /* called with rcu_read_lock() */ -static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, - struct netlink_callback *cb, int s_ip_idx, +static int in6_dump_addrs(const struct inet6_dev *idev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, struct inet6_fill_args *fillargs) { - struct ifmcaddr6 *ifmca; - struct ifacaddr6 *ifaca; + const struct ifmcaddr6 *ifmca; + const struct ifacaddr6 *ifaca; int ip_idx = 0; - int err = 1; + int err = 0; - read_lock_bh(&idev->lock); switch (fillargs->type) { case UNICAST_ADDR: { - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; fillargs->event = RTM_NEWADDR; /* unicast address incl. temp addr */ - list_for_each_entry(ifa, &idev->addr_list, if_list) { - if (ip_idx < s_ip_idx) + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { + if (ip_idx < *s_ip_idx) goto next; err = inet6_fill_ifaddr(skb, ifa, fillargs); if (err < 0) @@ -5239,27 +5302,25 @@ next: break; } case MULTICAST_ADDR: - read_unlock_bh(&idev->lock); fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = rtnl_dereference(idev->mc_list); + for (ifmca = rcu_dereference(idev->mc_list); ifmca; - ifmca = rtnl_dereference(ifmca->next), ip_idx++) { - if (ip_idx < s_ip_idx) + ifmca = rcu_dereference(ifmca->next), ip_idx++) { + if (ip_idx < *s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); if (err < 0) break; } - read_lock_bh(&idev->lock); break; case ANYCAST_ADDR: fillargs->event = RTM_GETANYCAST; /* anycast address */ - for (ifaca = idev->ac_list; ifaca; - ifaca = ifaca->aca_next, ip_idx++) { - if (ip_idx < s_ip_idx) + for (ifaca = rcu_dereference(idev->ac_list); ifaca; + ifaca = rcu_dereference(ifaca->aca_next), ip_idx++) { + if (ip_idx < *s_ip_idx) continue; err = inet6_fill_ifacaddr(skb, ifaca, fillargs); if (err < 0) @@ -5269,8 +5330,7 @@ next: default: break; } - read_unlock_bh(&idev->lock); - cb->args[2] = ip_idx; + *s_ip_idx = err ? ip_idx : 0; return err; } @@ -5333,6 +5393,7 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type) { + struct net *tgt_net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; struct inet6_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, @@ -5341,72 +5402,53 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .netnsid = -1, .type = type, }; - struct net *tgt_net = sock_net(skb->sk); - int idx, s_idx, s_ip_idx; - int h, s_h; + struct { + unsigned long ifindex; + int ip_idx; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct inet6_dev *idev; - struct hlist_head *head; int err = 0; - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - s_ip_idx = cb->args[2]; - + rcu_read_lock(); if (cb->strict_check) { err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) - goto put_tgt_net; + goto done; err = 0; if (fillargs.ifindex) { - dev = __dev_get_by_index(tgt_net, fillargs.ifindex); + dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; - goto put_tgt_net; + goto done; } idev = __in6_dev_get(dev); - if (idev) { - err = in6_dump_addrs(idev, skb, cb, s_ip_idx, + if (idev) + err = in6_dump_addrs(idev, skb, cb, + &ctx->ip_idx, &fillargs); - if (err > 0) - err = 0; - } - goto put_tgt_net; + goto done; } } - rcu_read_lock(); cb->seq = inet6_base_seq(tgt_net); - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - if (h > s_h || idx > s_idx) - s_ip_idx = 0; - idev = __in6_dev_get(dev); - if (!idev) - goto cont; - - if (in6_dump_addrs(idev, skb, cb, s_ip_idx, - &fillargs) < 0) - goto done; -cont: - idx++; - } + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + idev = __in6_dev_get(dev); + if (!idev) + continue; + err = in6_dump_addrs(idev, skb, cb, &ctx->ip_idx, + &fillargs); + if (err < 0) + goto done; } done: rcu_read_unlock(); - cb->args[0] = h; - cb->args[1] = idx; -put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return skb->len ? : err; + return err; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) @@ -5579,87 +5621,97 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); } -static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, - __s32 *array, int bytes) +static void ipv6_store_devconf(const struct ipv6_devconf *cnf, + __s32 *array, int bytes) { BUG_ON(bytes < (DEVCONF_MAX * 4)); memset(array, 0, bytes); - array[DEVCONF_FORWARDING] = cnf->forwarding; - array[DEVCONF_HOPLIMIT] = cnf->hop_limit; - array[DEVCONF_MTU6] = cnf->mtu6; - array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; - array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; - array[DEVCONF_AUTOCONF] = cnf->autoconf; - array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; - array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; + array[DEVCONF_FORWARDING] = READ_ONCE(cnf->forwarding); + array[DEVCONF_HOPLIMIT] = READ_ONCE(cnf->hop_limit); + array[DEVCONF_MTU6] = READ_ONCE(cnf->mtu6); + array[DEVCONF_ACCEPT_RA] = READ_ONCE(cnf->accept_ra); + array[DEVCONF_ACCEPT_REDIRECTS] = READ_ONCE(cnf->accept_redirects); + array[DEVCONF_AUTOCONF] = READ_ONCE(cnf->autoconf); + array[DEVCONF_DAD_TRANSMITS] = READ_ONCE(cnf->dad_transmits); + array[DEVCONF_RTR_SOLICITS] = READ_ONCE(cnf->rtr_solicits); array[DEVCONF_RTR_SOLICIT_INTERVAL] = - jiffies_to_msecs(cnf->rtr_solicit_interval); + jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_interval)); array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = - jiffies_to_msecs(cnf->rtr_solicit_max_interval); + jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_max_interval)); array[DEVCONF_RTR_SOLICIT_DELAY] = - jiffies_to_msecs(cnf->rtr_solicit_delay); - array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; + jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_delay)); + array[DEVCONF_FORCE_MLD_VERSION] = READ_ONCE(cnf->force_mld_version); array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = - jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + jiffies_to_msecs(READ_ONCE(cnf->mldv1_unsolicited_report_interval)); array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = - jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); - array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; - array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; - array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; - array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; - array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; - array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; - array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; - array[DEVCONF_RA_DEFRTR_METRIC] = cnf->ra_defrtr_metric; - array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; - array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; + jiffies_to_msecs(READ_ONCE(cnf->mldv2_unsolicited_report_interval)); + array[DEVCONF_USE_TEMPADDR] = READ_ONCE(cnf->use_tempaddr); + array[DEVCONF_TEMP_VALID_LFT] = READ_ONCE(cnf->temp_valid_lft); + array[DEVCONF_TEMP_PREFERED_LFT] = READ_ONCE(cnf->temp_prefered_lft); + array[DEVCONF_REGEN_MAX_RETRY] = READ_ONCE(cnf->regen_max_retry); + array[DEVCONF_MAX_DESYNC_FACTOR] = READ_ONCE(cnf->max_desync_factor); + array[DEVCONF_MAX_ADDRESSES] = READ_ONCE(cnf->max_addresses); + array[DEVCONF_ACCEPT_RA_DEFRTR] = READ_ONCE(cnf->accept_ra_defrtr); + array[DEVCONF_RA_DEFRTR_METRIC] = READ_ONCE(cnf->ra_defrtr_metric); + array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = + READ_ONCE(cnf->accept_ra_min_hop_limit); + array[DEVCONF_ACCEPT_RA_PINFO] = READ_ONCE(cnf->accept_ra_pinfo); #ifdef CONFIG_IPV6_ROUTER_PREF - array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; + array[DEVCONF_ACCEPT_RA_RTR_PREF] = READ_ONCE(cnf->accept_ra_rtr_pref); array[DEVCONF_RTR_PROBE_INTERVAL] = - jiffies_to_msecs(cnf->rtr_probe_interval); + jiffies_to_msecs(READ_ONCE(cnf->rtr_probe_interval)); #ifdef CONFIG_IPV6_ROUTE_INFO - array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; - array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = + READ_ONCE(cnf->accept_ra_rt_info_min_plen); + array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = + READ_ONCE(cnf->accept_ra_rt_info_max_plen); #endif #endif - array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp; - array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; + array[DEVCONF_PROXY_NDP] = READ_ONCE(cnf->proxy_ndp); + array[DEVCONF_ACCEPT_SOURCE_ROUTE] = + READ_ONCE(cnf->accept_source_route); #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; - array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; + array[DEVCONF_OPTIMISTIC_DAD] = READ_ONCE(cnf->optimistic_dad); + array[DEVCONF_USE_OPTIMISTIC] = READ_ONCE(cnf->use_optimistic); #endif #ifdef CONFIG_IPV6_MROUTE array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding); #endif - array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; - array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; - array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; - array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; - array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; - array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; - array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; - array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; + array[DEVCONF_DISABLE_IPV6] = READ_ONCE(cnf->disable_ipv6); + array[DEVCONF_ACCEPT_DAD] = READ_ONCE(cnf->accept_dad); + array[DEVCONF_FORCE_TLLAO] = READ_ONCE(cnf->force_tllao); + array[DEVCONF_NDISC_NOTIFY] = READ_ONCE(cnf->ndisc_notify); + array[DEVCONF_SUPPRESS_FRAG_NDISC] = + READ_ONCE(cnf->suppress_frag_ndisc); + array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = + READ_ONCE(cnf->accept_ra_from_local); + array[DEVCONF_ACCEPT_RA_MTU] = READ_ONCE(cnf->accept_ra_mtu); + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = + READ_ONCE(cnf->ignore_routes_with_linkdown); /* we omit DEVCONF_STABLE_SECRET for now */ - array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; - array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; - array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; - array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; - array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled; + array[DEVCONF_USE_OIF_ADDRS_ONLY] = READ_ONCE(cnf->use_oif_addrs_only); + array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = + READ_ONCE(cnf->drop_unicast_in_l2_multicast); + array[DEVCONF_DROP_UNSOLICITED_NA] = READ_ONCE(cnf->drop_unsolicited_na); + array[DEVCONF_KEEP_ADDR_ON_DOWN] = READ_ONCE(cnf->keep_addr_on_down); + array[DEVCONF_SEG6_ENABLED] = READ_ONCE(cnf->seg6_enabled); #ifdef CONFIG_IPV6_SEG6_HMAC - array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; + array[DEVCONF_SEG6_REQUIRE_HMAC] = READ_ONCE(cnf->seg6_require_hmac); #endif - array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; - array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; - array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; - array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; - array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; - array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled; - array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; - array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; - array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; - array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; - array[DEVCONF_ACCEPT_RA_MIN_LFT] = cnf->accept_ra_min_lft; + array[DEVCONF_ENHANCED_DAD] = READ_ONCE(cnf->enhanced_dad); + array[DEVCONF_ADDR_GEN_MODE] = READ_ONCE(cnf->addr_gen_mode); + array[DEVCONF_DISABLE_POLICY] = READ_ONCE(cnf->disable_policy); + array[DEVCONF_NDISC_TCLASS] = READ_ONCE(cnf->ndisc_tclass); + array[DEVCONF_RPL_SEG_ENABLED] = READ_ONCE(cnf->rpl_seg_enabled); + array[DEVCONF_IOAM6_ENABLED] = READ_ONCE(cnf->ioam6_enabled); + array[DEVCONF_IOAM6_ID] = READ_ONCE(cnf->ioam6_id); + array[DEVCONF_IOAM6_ID_WIDE] = READ_ONCE(cnf->ioam6_id_wide); + array[DEVCONF_NDISC_EVICT_NOCARRIER] = + READ_ONCE(cnf->ndisc_evict_nocarrier); + array[DEVCONF_ACCEPT_UNTRACKED_NA] = + READ_ONCE(cnf->accept_untracked_na); + array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft); } static inline size_t inet6_ifla6_size(void) @@ -5739,13 +5791,14 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, u32 ext_filter_mask) { - struct nlattr *nla; struct ifla_cacheinfo ci; + struct nlattr *nla; + u32 ra_mtu; - if (nla_put_u32(skb, IFLA_INET6_FLAGS, idev->if_flags)) + if (nla_put_u32(skb, IFLA_INET6_FLAGS, READ_ONCE(idev->if_flags))) goto nla_put_failure; ci.max_reasm_len = IPV6_MAXPLEN; - ci.tstamp = cstamp_delta(idev->tstamp); + ci.tstamp = cstamp_delta(READ_ONCE(idev->tstamp)); ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME)); if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci)) @@ -5777,11 +5830,12 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla)); read_unlock_bh(&idev->lock); - if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) + if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, + READ_ONCE(idev->cnf.addr_gen_mode))) goto nla_put_failure; - if (idev->ra_mtu && - nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu)) + ra_mtu = READ_ONCE(idev->ra_mtu); + if (ra_mtu && nla_put_u32(skb, IFLA_INET6_RA_MTU, ra_mtu)) goto nla_put_failure; return 0; @@ -5843,7 +5897,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token, return -EINVAL; } - if (idev->cnf.rtr_solicits == 0) { + if (READ_ONCE(idev->cnf.rtr_solicits) == 0) { NL_SET_ERR_MSG(extack, "Router solicitation is disabled on device"); return -EINVAL; @@ -5876,7 +5930,7 @@ update_lft: if (update_rs) { idev->if_flags |= IF_RS_SENT; idev->rs_interval = rfc3315_s14_backoff_init( - idev->cnf.rtr_solicit_interval); + READ_ONCE(idev->cnf.rtr_solicit_interval)); idev->rs_probes = 1; addrconf_mod_rs_timer(idev, idev->rs_interval); } @@ -5982,7 +6036,7 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla, if (tb[IFLA_INET6_ADDR_GEN_MODE]) { u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]); - idev->cnf.addr_gen_mode = mode; + WRITE_ONCE(idev->cnf.addr_gen_mode, mode); } return 0; @@ -5994,6 +6048,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, struct net_device *dev = idev->dev; struct ifinfomsg *hdr; struct nlmsghdr *nlh; + int ifindex, iflink; void *protoinfo; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); @@ -6004,18 +6059,20 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_family = AF_INET6; hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; - hdr->ifi_index = dev->ifindex; + ifindex = READ_ONCE(dev->ifindex); + hdr->ifi_index = ifindex; hdr->ifi_flags = dev_get_flags(dev); hdr->ifi_change = 0; + iflink = dev_get_iflink(dev); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + (ifindex != iflink && + nla_put_u32(skb, IFLA_LINK, iflink)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN)) + netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN)) goto nla_put_failure; protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO); if (!protoinfo) @@ -6061,50 +6118,39 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int h, s_h; - int idx = 0, s_idx; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct inet6_dev *idev; - struct hlist_head *head; + int err; /* only requests using strict checking can pass data to * influence the dump */ if (cb->strict_check) { - int err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack); + err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack); if (err < 0) return err; } - s_h = cb->args[0]; - s_idx = cb->args[1]; - + err = 0; rcu_read_lock(); - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - idev = __in6_dev_get(dev); - if (!idev) - goto cont; - if (inet6_fill_ifinfo(skb, idev, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWLINK, NLM_F_MULTI) < 0) - goto out; -cont: - idx++; - } + for_each_netdev_dump(net, dev, ctx->ifindex) { + idev = __in6_dev_get(dev); + if (!idev) + continue; + err = inet6_fill_ifinfo(skb, idev, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWLINK, NLM_F_MULTI); + if (err < 0) + break; } -out: rcu_read_unlock(); - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void inet6_ifinfo_notify(int event, struct inet6_dev *idev) @@ -6325,7 +6371,8 @@ static void addrconf_disable_change(struct net *net, __s32 newf) idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); - idev->cnf.disable_ipv6 = newf; + + WRITE_ONCE(idev->cnf.disable_ipv6, newf); if (changed) dev_disable_change(idev); } @@ -6334,23 +6381,22 @@ static void addrconf_disable_change(struct net *net, __s32 newf) static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; + if (p == &net->ipv6.devconf_dflt->disable_ipv6) { + WRITE_ONCE(*p, newf); + return 0; + } + if (!rtnl_trylock()) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; - *p = newf; - - if (p == &net->ipv6.devconf_dflt->disable_ipv6) { - rtnl_unlock(); - return 0; - } + WRITE_ONCE(*p, newf); if (p == &net->ipv6.devconf_all->disable_ipv6) { - net->ipv6.devconf_dflt->disable_ipv6 = newf; + WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf); addrconf_disable_change(net, newf); } else if ((!newf) ^ (!old)) dev_disable_change((struct inet6_dev *)table->extra1); @@ -6461,24 +6507,25 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, } if (idev->cnf.addr_gen_mode != new_val) { - idev->cnf.addr_gen_mode = new_val; + WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); addrconf_init_auto_addrs(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; - net->ipv6.devconf_dflt->addr_gen_mode = new_val; + WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val); for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev && idev->cnf.addr_gen_mode != new_val) { - idev->cnf.addr_gen_mode = new_val; + WRITE_ONCE(idev->cnf.addr_gen_mode, + new_val); addrconf_init_auto_addrs(idev->dev); } } } - *((u32 *)ctl->data) = new_val; + WRITE_ONCE(*((u32 *)ctl->data), new_val); } out: @@ -6537,14 +6584,15 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, struct inet6_dev *idev = __in6_dev_get(dev); if (idev) { - idev->cnf.addr_gen_mode = - IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + WRITE_ONCE(idev->cnf.addr_gen_mode, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY); } } } else { struct inet6_dev *idev = ctl->extra1; - idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + WRITE_ONCE(idev->cnf.addr_gen_mode, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY); } out: @@ -6624,20 +6672,19 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) static int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) { + struct net *net = (struct net *)ctl->extra2; struct inet6_dev *idev; - struct net *net; - - if (!rtnl_trylock()) - return restart_syscall(); - - *valp = val; - net = (struct net *)ctl->extra2; if (valp == &net->ipv6.devconf_dflt->disable_policy) { - rtnl_unlock(); + WRITE_ONCE(*valp, val); return 0; } + if (!rtnl_trylock()) + return restart_syscall(); + + WRITE_ONCE(*valp, val); + if (valp == &net->ipv6.devconf_all->disable_policy) { struct net_device *dev; @@ -6807,6 +6854,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "regen_min_advance", + .data = &ipv6_devconf.regen_min_advance, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "regen_max_retry", .data = &ipv6_devconf.regen_max_retry, .maxlen = sizeof(int), @@ -7131,14 +7185,12 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - /* sentinel */ - } }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { + size_t table_size = ARRAY_SIZE(addrconf_sysctl); int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -7147,7 +7199,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!table) goto out; - for (i = 0; table[i].data; i++) { + for (i = 0; i < table_size; i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; /* If one of these is already set, then it is not safe to * overwrite either of them: this makes proc_dointvec_minmax @@ -7162,7 +7214,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); p->sysctl_header = register_net_sysctl_sz(net, path, table, - ARRAY_SIZE(addrconf_sysctl)); + table_size); if (!p->sysctl_header) goto free; @@ -7185,7 +7237,7 @@ out: static void __addrconf_sysctl_unregister(struct net *net, struct ipv6_devconf *p, int ifindex) { - struct ctl_table *table; + const struct ctl_table *table; if (!p->sysctl_header) return; @@ -7366,7 +7418,8 @@ int __init addrconf_init(void) if (err < 0) goto out_addrlabel; - addrconf_wq = create_workqueue("ipv6_addrconf"); + /* All works using addrconf_wq need to lock rtnl. */ + addrconf_wq = create_singlethread_workqueue("ipv6_addrconf"); if (!addrconf_wq) { err = -ENOMEM; goto out_nowq; @@ -7389,7 +7442,7 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, - NULL, inet6_dump_ifinfo, 0); + NULL, inet6_dump_ifinfo, RTNL_FLAG_DUMP_UNLOCKED); if (err < 0) goto errout; @@ -7403,21 +7456,25 @@ int __init addrconf_init(void) goto errout; err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr, - RTNL_FLAG_DOIT_UNLOCKED); + RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); if (err < 0) goto errout; err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, - NULL, inet6_dump_ifmcaddr, 0); + NULL, inet6_dump_ifmcaddr, + RTNL_FLAG_DUMP_UNLOCKED); if (err < 0) goto errout; err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, - NULL, inet6_dump_ifacaddr, 0); + NULL, inet6_dump_ifacaddr, + RTNL_FLAG_DUMP_UNLOCKED); if (err < 0) goto errout; err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, inet6_netconf_dump_devconf, - RTNL_FLAG_DOIT_UNLOCKED); + RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); if (err < 0) goto errout; err = ipv6_addr_label_rtnl_register(); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 17ac45aa71..acd70b5992 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - net->ipv6.ip6addrlbl_table.seq++; + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq + 1); return ret; } @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, }; static int ip6addrlbl_fill(struct sk_buff *skb, - struct ip6addrlbl_entry *p, + const struct ip6addrlbl_entry *p, u32 lseq, u32 portid, u32 seq, int event, unsigned int flags) @@ -498,7 +499,8 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; - int err; + int err = 0; + u32 lseq; if (cb->strict_check) { err = ip6addrlbl_valid_dump_req(nlh, cb->extack); @@ -507,10 +509,11 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_lock(); + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - net->ipv6.ip6addrlbl_table.seq, + lseq, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -522,7 +525,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline int ip6addrlbl_msgsize(void) @@ -614,7 +617,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - lseq = net->ipv6.ip6addrlbl_table.seq; + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); if (p) err = ip6addrlbl_fill(skb, p, lseq, NETLINK_CB(in_skb).portid, @@ -647,6 +650,7 @@ int __init ipv6_addr_label_rtnl_register(void) return ret; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); return ret; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 959bfd9f63..8041dc181b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -64,6 +64,7 @@ #include <net/xfrm.h> #include <net/ioam6.h> #include <net/rawv6.h> +#include <net/rps.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -736,7 +737,7 @@ const struct proto_ops inet6_dgram_ops = { .recvmsg = inet6_recvmsg, /* retpoline's sake */ .read_skb = udp_read_skb, .mmap = sock_no_mmap, - .set_peek_off = sk_set_peek_off, + .set_peek_off = udp_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index bb17f484ee..0627c4c18d 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -252,9 +252,8 @@ static void aca_free_rcu(struct rcu_head *h) static void aca_put(struct ifacaddr6 *ac) { - if (refcount_dec_and_test(&ac->aca_refcnt)) { - call_rcu(&ac->rcu, aca_free_rcu); - } + if (refcount_dec_and_test(&ac->aca_refcnt)) + call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, @@ -296,7 +295,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) goto out; } - for (aca = idev->ac_list; aca; aca = aca->aca_next) { + for (aca = rtnl_dereference(idev->ac_list); aca; + aca = rtnl_dereference(aca->aca_next)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) { aca->aca_users++; err = 0; @@ -317,13 +317,13 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) goto out; } - aca->aca_next = idev->ac_list; - idev->ac_list = aca; - /* Hold this for addrconf_join_solict() below before we unlock, * it is already exposed via idev->ac_list. */ aca_get(aca); + aca->aca_next = idev->ac_list; + rcu_assign_pointer(idev->ac_list, aca); + write_unlock_bh(&idev->lock); ipv6_add_acaddr_hash(net, aca); @@ -350,7 +350,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_lock_bh(&idev->lock); prev_aca = NULL; - for (aca = idev->ac_list; aca; aca = aca->aca_next) { + for (aca = rtnl_dereference(idev->ac_list); aca; + aca = rtnl_dereference(aca->aca_next)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) break; prev_aca = aca; @@ -364,9 +365,9 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } if (prev_aca) - prev_aca->aca_next = aca->aca_next; + rcu_assign_pointer(prev_aca->aca_next, aca->aca_next); else - idev->ac_list = aca->aca_next; + rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); @@ -392,8 +393,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) struct ifacaddr6 *aca; write_lock_bh(&idev->lock); - while ((aca = idev->ac_list) != NULL) { - idev->ac_list = aca->aca_next; + while ((aca = rtnl_dereference(idev->ac_list)) != NULL) { + rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); ipv6_del_acaddr_hash(aca); @@ -420,11 +421,10 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - for (aca = idev->ac_list; aca; aca = aca->aca_next) + for (aca = rcu_dereference(idev->ac_list); aca; + aca = rcu_dereference(aca->aca_next)) if (ipv6_addr_equal(&aca->aca_addr, addr)) break; - read_unlock_bh(&idev->lock); return aca != NULL; } return false; @@ -477,30 +477,25 @@ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, struct ac6_iter_state { struct seq_net_private p; struct net_device *dev; - struct inet6_dev *idev; }; #define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) { - struct ifacaddr6 *im = NULL; struct ac6_iter_state *state = ac6_seq_private(seq); struct net *net = seq_file_net(seq); + struct ifacaddr6 *im = NULL; - state->idev = NULL; for_each_netdev_rcu(net, state->dev) { struct inet6_dev *idev; + idev = __in6_dev_get(state->dev); if (!idev) continue; - read_lock_bh(&idev->lock); - im = idev->ac_list; - if (im) { - state->idev = idev; + im = rcu_dereference(idev->ac_list); + if (im) break; - } - read_unlock_bh(&idev->lock); } return im; } @@ -508,22 +503,17 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) { struct ac6_iter_state *state = ac6_seq_private(seq); + struct inet6_dev *idev; - im = im->aca_next; + im = rcu_dereference(im->aca_next); while (!im) { - if (likely(state->idev != NULL)) - read_unlock_bh(&state->idev->lock); - state->dev = next_net_device_rcu(state->dev); - if (!state->dev) { - state->idev = NULL; + if (!state->dev) break; - } - state->idev = __in6_dev_get(state->dev); - if (!state->idev) + idev = __in6_dev_get(state->dev); + if (!idev) continue; - read_lock_bh(&state->idev->lock); - im = state->idev->ac_list; + im = rcu_dereference(idev->ac_list); } return im; } @@ -555,12 +545,6 @@ static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ac6_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { - struct ac6_iter_state *state = ac6_seq_private(seq); - - if (likely(state->idev != NULL)) { - read_unlock_bh(&state->idev->lock); - state->idev = NULL; - } rcu_read_unlock(); } diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 1578ed9e97..eb8ee1e937 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -657,11 +657,8 @@ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, net_clen_bits, spot + 1, 1); - if (spot < 0) { - if (spot == -2) - return -EFAULT; + if (spot < 0) return 0; - } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, spot, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7371886d4f..3920e8aa10 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <net/tcp.h> #include <net/espintcp.h> #include <net/inet6_hashtables.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -131,7 +132,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET6_ESPINTCP @@ -255,8 +256,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) #else static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) { - kfree_skb(skb); - + WARN_ON(1); return -EOPNOTSUPP; } #endif @@ -383,7 +383,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, __be16 dport) { struct udphdr *uh; - __be32 *udpdata32; unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); @@ -398,12 +397,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, *skb_mac_header(skb) = IPPROTO_UDP; - if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - return (struct ip_esp_hdr *)(udpdata32 + 2); - } - return (struct ip_esp_hdr *)(uh + 1); } @@ -459,7 +452,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; case TCP_ENCAP_ESPINTCP: @@ -822,7 +814,6 @@ int esp6_input_done2(struct sk_buff *skb, int err) source = th->source; break; case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; break; default: @@ -1232,9 +1223,6 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); - break; #ifdef CONFIG_INET6_ESPINTCP case TCP_ENCAP_ESPINTCP: /* only the length field, TCP encap is done by diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 527b7caddb..919ebfabbe 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -83,6 +83,13 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET6); + + if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { + /* non-offload path will record the error and audit log */ + xfrm_state_put(x); + x = NULL; + } + if (!x) goto out_reset; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 02e9ffb63a..6789623b2b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -50,6 +50,7 @@ #endif #include <net/rpl.h> #include <linux/ioam6.h> +#include <linux/ioam6_genl.h> #include <net/ioam6.h> #include <net/dst_metadata.h> @@ -378,9 +379,8 @@ static int ipv6_srh_rcv(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); - accept_seg6 = net->ipv6.devconf_all->seg6_enabled; - if (accept_seg6 > idev->cnf.seg6_enabled) - accept_seg6 = idev->cnf.seg6_enabled; + accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled), + READ_ONCE(idev->cnf.seg6_enabled)); if (!accept_seg6) { kfree_skb(skb); @@ -654,10 +654,13 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); - int accept_source_route = net->ipv6.devconf_all->accept_source_route; + int accept_source_route; - if (idev && accept_source_route > idev->cnf.accept_source_route) - accept_source_route = idev->cnf.accept_source_route; + accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route); + + if (idev) + accept_source_route = min(accept_source_route, + READ_ONCE(idev->cnf.accept_source_route)); if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + @@ -801,7 +804,7 @@ looped_back: ip6_route_input(skb); if (skb_dst(skb)->error) { - skb_push(skb, skb->data - skb_network_header(skb)); + skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; } @@ -818,7 +821,7 @@ looped_back: goto looped_back; } - skb_push(skb, skb->data - skb_network_header(skb)); + skb_push(skb, -skb_network_offset(skb)); dst_input(skb); return -1; @@ -880,14 +883,6 @@ void ipv6_exthdrs_exit(void) Hop-by-hop options. **********************************/ -/* - * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). - */ -static inline struct net *ipv6_skb_net(struct sk_buff *skb) -{ - return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); -} - /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) @@ -918,7 +913,7 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) goto drop; /* Ignore if IOAM is not enabled on ingress */ - if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) + if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled)) goto ignore; /* Truncated Option header */ @@ -938,7 +933,7 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) goto drop; /* Ignore if the IOAM namespace is unknown */ - ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); + ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id); if (!ns) goto ignore; @@ -954,6 +949,9 @@ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) + optoff + sizeof(*hdr)); ioam6_fill_trace_data(skb, ns, trace, true); + + ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev), + GFP_ATOMIC, (void *)trace, hdr->opt_len - 2); break; default: break; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 52c04f0ac4..9e254de746 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -233,8 +233,12 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { + struct inet6_dev *idev = ip6_dst_idev(&rt->dst); + + if (!idev) + goto again; err = fib6_rule_saddr(net, rule, flags, flp6, - ip6_dst_idev(&rt->dst)->dev); + idev->dev); if (err == -EAGAIN) goto again; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1635da0728..7b31674644 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -212,7 +212,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { res = true; } else { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); int tmo = net->ipv6.sysctl.icmpv6_time; struct inet_peer *peer; @@ -241,7 +241,7 @@ static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, dst = ip6_route_output(net, sk, fl6); if (!dst->error) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); @@ -616,7 +616,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, + &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -803,7 +803,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT)) { + dst_rt6_info(dst), MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -1206,7 +1206,6 @@ static struct ctl_table ipv6_icmp_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 8c1ce78956..ff7e734e33 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -38,7 +38,7 @@ static inline struct ila_params *ila_params_lwtunnel( static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); - struct rt6_info *rt = (struct rt6_info *)orig_dst; + struct rt6_info *rt = dst_rt6_info(orig_dst); struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; @@ -58,7 +58,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) return orig_dst->lwtstate->orig_output(net, sk, skb); } + local_bh_disable(); dst = dst_cache_get(&ilwt->dst_cache); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; @@ -70,7 +72,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = orig_dst->dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst, + fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); @@ -86,8 +88,11 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (ilwt->connected) + if (ilwt->connected) { + local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); + local_bh_enable(); + } } skb_dst_set(skb, dst); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b0e8d278e8..6db71bb1cd 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -14,27 +14,26 @@ #include <linux/random.h> #include <net/addrconf.h> +#include <net/hotdata.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> +#include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { - static u32 inet6_ehash_secret __read_mostly; - static u32 ipv6_hash_secret __read_mostly; - u32 lhash, fhash; net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); - net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); + net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; - fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); + fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); @@ -291,7 +290,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) + if (sk->sk_protocol == IPPROTO_TCP && + tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 571f0e4d9c..08c9295130 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -612,6 +612,68 @@ static const struct genl_ops ioam6_genl_ops[] = { }, }; +#define IOAM6_GENL_EV_GRP_OFFSET 0 + +static const struct genl_multicast_group ioam6_mcgrps[] = { + [IOAM6_GENL_EV_GRP_OFFSET] = { .name = IOAM6_GENL_EV_GRP_NAME, + .flags = GENL_MCAST_CAP_NET_ADMIN }, +}; + +static int ioam6_event_put_trace(struct sk_buff *skb, + struct ioam6_trace_hdr *trace, + unsigned int len) +{ + if (nla_put_u16(skb, IOAM6_EVENT_ATTR_TRACE_NAMESPACE, + be16_to_cpu(trace->namespace_id)) || + nla_put_u8(skb, IOAM6_EVENT_ATTR_TRACE_NODELEN, trace->nodelen) || + nla_put_u32(skb, IOAM6_EVENT_ATTR_TRACE_TYPE, + be32_to_cpu(trace->type_be32)) || + nla_put(skb, IOAM6_EVENT_ATTR_TRACE_DATA, + len - sizeof(struct ioam6_trace_hdr) - trace->remlen * 4, + trace->data + trace->remlen * 4)) + return 1; + + return 0; +} + +void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp, + void *opt, unsigned int opt_len) +{ + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&ioam6_genl_family, net, + IOAM6_GENL_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &ioam6_genl_family, 0, type); + if (!nlh) + goto nla_put_failure; + + switch (type) { + case IOAM6_EVENT_UNSPEC: + WARN_ON_ONCE(1); + break; + case IOAM6_EVENT_TRACE: + if (ioam6_event_put_trace(skb, (struct ioam6_trace_hdr *)opt, + opt_len)) + goto nla_put_failure; + break; + } + + genlmsg_end(skb, nlh); + genlmsg_multicast_netns(&ioam6_genl_family, net, skb, 0, + IOAM6_GENL_EV_GRP_OFFSET, gfp); + return; + +nla_put_failure: + nlmsg_free(skb); +} + static struct genl_family ioam6_genl_family __ro_after_init = { .name = IOAM6_GENL_NAME, .version = IOAM6_GENL_VERSION, @@ -620,6 +682,8 @@ static struct genl_family ioam6_genl_family __ro_after_init = { .ops = ioam6_genl_ops, .n_ops = ARRAY_SIZE(ioam6_genl_ops), .resv_start_op = IOAM6_CMD_NS_SET_SCHEMA + 1, + .mcgrps = ioam6_mcgrps, + .n_mcgrps = ARRAY_SIZE(ioam6_mcgrps), .module = THIS_MODULE, }; @@ -663,7 +727,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id; + raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); @@ -671,7 +735,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id; + raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); @@ -758,7 +822,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide; + raw32 = READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); @@ -766,7 +830,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide; + raw32 = READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 7563f8c6aa..bf7120ecea 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -351,9 +351,9 @@ do_encap: goto drop; if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&ilwt->cache); - preempt_enable(); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -373,9 +373,9 @@ do_encap: goto drop; } - preempt_disable(); + local_bh_disable(); dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - preempt_enable(); + local_bh_enable(); } skb_dst_drop(skb); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8184076a39..83e4f9855a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); + INIT_HLIST_NODE(&f6i->gc_link); + return f6i; } @@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ -617,23 +620,25 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { - struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, - .filter.dump_routes = true }; + struct rt6_rtnl_dump_arg arg = { + .filter.dump_exceptions = true, + .filter.dump_routes = true, + .filter.rtnl_held = false, + }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - unsigned int h, s_h; unsigned int e = 0, s_e; + struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; - struct hlist_head *head; - int res = 0; + unsigned int h, s_h; + int err = 0; + rcu_read_lock(); if (cb->strict_check) { - int err; - err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) - return err; + goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); @@ -648,8 +653,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return -ENOMEM; + if (!w) { + err = -ENOMEM; + goto unlock; + } w->func = fib6_dump_node; cb->args[2] = (long)w; @@ -669,46 +676,46 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) - goto out; + goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } if (!cb->args[0]) { - res = fib6_dump_table(tb, skb, cb); - if (!res) + err = fib6_dump_table(tb, skb, cb); + if (!err) cb->args[0] = 1; } - goto out; + goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; - rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; - res = fib6_dump_table(tb, skb, cb); - if (res != 0) - goto out_unlock; + err = fib6_dump_table(tb, skb, cb); + if (err != 0) + goto out; next: e++; } } -out_unlock: - rcu_read_unlock(); +out: cb->args[1] = e; cb->args[0] = h; -out: - res = res < 0 ? res : skb->len; - if (res <= 0) + +unlock: + rcu_read_unlock(); + if (err <= 0) fib6_dump_end(cb); - return res; + return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) @@ -751,8 +758,6 @@ static struct fib6_node *fib6_add_1(struct net *net, int bit; __be32 dir = 0; - RT6_TRACE("fib6_add_1\n"); - /* insert node in tree */ fn = root; @@ -961,6 +966,7 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, if (!fib6_nh->rt6i_pcpu) return; + rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ @@ -969,7 +975,9 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; + + /* Paired with xchg() in rt6_get_pcpu_route() */ + pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes @@ -983,6 +991,7 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, fib6_info_release(from); } } + rcu_read_unlock(); } struct fib6_nh_pcpu_arg { @@ -1057,6 +1066,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } } + + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); } /* @@ -1117,10 +1129,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->fib6_flags & RTF_EXPIRES)) + if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); - else + fib6_remove_gc_list(iter); + } else { fib6_set_expires(iter, rt->expires); + fib6_add_gc_list(iter); + } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1482,6 +1497,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (rt->fib6_flags & RTF_EXPIRES) + fib6_add_gc_list(rt); + fib6_start_gc(info->nl_net, rt); } @@ -1806,7 +1825,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; - RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); @@ -1869,7 +1888,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net, FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { - RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", + w, w->state, nstate); w->node = pn; w->state = nstate; } @@ -1877,10 +1897,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (w->node == fn) { w->node = child; if (children&2) { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } @@ -1908,8 +1930,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct net *net = info->nl_net; bool notify_del = false; - RT6_TRACE("fib6_del_route\n"); - /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. @@ -1958,7 +1978,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { - RT6_TRACE("walker %p adjusted by delroute\n", w); + pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) @@ -2284,9 +2304,8 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct fib6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { - struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -2296,7 +2315,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { - RT6_TRACE("expiring %p\n", rt); + pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; @@ -2311,6 +2330,42 @@ static int fib6_age(struct fib6_info *rt, void *arg) return 0; } +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; + + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + + fib6_gc_table(net, table, gc_args); + + spin_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; @@ -2326,7 +2381,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; - fib6_clean_all(net, fib6_age, &gc_args); + fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -2386,6 +2441,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), @@ -2398,6 +2454,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); @@ -2447,10 +2504,8 @@ int __init fib6_init(void) { int ret = -ENOMEM; - fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, - NULL); + fib6_node_kmem = KMEM_CACHE(fib6_node, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; @@ -2459,7 +2514,8 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, 0); + inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED | + RTNL_FLAG_DUMP_SPLIT_NLM_DONE); if (ret) goto out_unregister_subsys; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 289b83347d..3942bd2ade 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -496,11 +496,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) tpi->proto); if (tunnel) { if (tunnel->parms.collect_md) { + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; __be64 tun_id; - __be16 flags; - flags = tpi->flags; + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); @@ -551,14 +551,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (tunnel->parms.collect_md) { struct erspan_metadata *pkt_md, *md; + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, @@ -580,7 +580,8 @@ static int ip6erspan_rcv(struct sk_buff *skb, md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); @@ -748,8 +749,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 protocol; - __be16 flags; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -781,8 +782,11 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - flags = key->tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + ip_tunnel_flags_zero(flags); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, flags, key->tun_flags); tun_hlen = gre_calc_hlen(flags); if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) @@ -791,19 +795,21 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } else { if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; - flags = tunnel->parms.o_flags; + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -825,7 +831,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return -1; @@ -859,7 +866,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags))) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, @@ -886,7 +894,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return err; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); @@ -939,6 +948,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; @@ -982,7 +992,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; - t->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); IPCB(skb)->flags = 0; /* For collect_md mode, derive fl6 from the tunnel key, @@ -1007,7 +1017,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_info->key.tun_flags)) goto tx_err; if (tun_info->options_len < sizeof(*md)) goto tx_err; @@ -1068,7 +1079,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, } /* Push GRE header. */ - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) @@ -1211,8 +1224,8 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, t->parms.proto = p->proto; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); t->parms.fwmark = p->fwmark; t->parms.erspan_ver = p->erspan_ver; t->parms.index = p->index; @@ -1241,8 +1254,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = gre_flags_to_tnl_flags(u->i_flags); - p->o_flags = gre_flags_to_tnl_flags(u->o_flags); + gre_flags_to_tnl_flags(p->i_flags, u->i_flags); + gre_flags_to_tnl_flags(p->o_flags, u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -1394,7 +1407,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ipv6h->daddr = t->parms.raddr; p = (__be16 *)(ipv6h + 1); - p[0] = t->parms.o_flags; + p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags); p[1] = htons(type); /* @@ -1421,7 +1434,6 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1431,7 +1443,6 @@ static void ip6gre_dev_free(struct net_device *dev) gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); } static void ip6gre_tunnel_setup(struct net_device *dev) @@ -1440,6 +1451,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_IP6GRE; dev->flags |= IFF_NOARP; @@ -1458,19 +1470,17 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); - __be16 flags; dev->features |= GRE6_FEATURES | NETIF_F_LLTX; dev->hw_features |= GRE6_FEATURES; - flags = nt->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) && + nt->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -1489,13 +1499,9 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1519,9 +1525,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1636,21 +1639,19 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) +static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + ip6gre_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch = ip6gre_exit_batch_net, + .exit_batch_rtnl = ip6gre_exit_batch_rtnl, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; @@ -1797,12 +1798,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1855,7 +1856,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1884,13 +1884,9 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1912,9 +1908,6 @@ static int ip6erspan_tap_init(struct net_device *dev) cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1925,7 +1918,6 @@ static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1939,6 +1931,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -2149,11 +2142,13 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (p->erspan_ver == 1 || p->erspan_ver == 2) { if (!p->collect_md) - o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) goto nla_put_failure; @@ -2239,6 +2234,7 @@ static void ip6erspan_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -2410,7 +2406,7 @@ static void __exit ip6gre_fini(void) module_init(ip6gre_init); module_exit(ip6gre_fini); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>"); MODULE_DESCRIPTION("GRE over IPv6 tunneling device"); MODULE_ALIAS_RTNL_LINK("ip6gre"); MODULE_ALIAS_RTNL_LINK("ip6gretap"); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index b837881453..133610a49d 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -168,9 +168,9 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, SKB_DR_SET(reason, NOT_SPECIFIED); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || - !idev || unlikely(idev->cnf.disable_ipv6)) { + !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); - if (idev && unlikely(idev->cnf.disable_ipv6)) + if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6))) SKB_DR_SET(reason, IPV6DISABLED); goto drop; } @@ -236,7 +236,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, if (!ipv6_addr_is_multicast(&hdr->daddr) && (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) && - idev->cnf.drop_unicast_in_l2_multicast) { + READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) { SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST); goto err; } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cca64c7809..9822163428 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -67,7 +67,7 @@ static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) off += len; } - skb_gro_pull(skb, off - skb_network_offset(skb)); + skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); return proto; } @@ -236,7 +236,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, if (unlikely(!iph)) goto out; - skb_set_network_header(skb, off); + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; @@ -259,7 +259,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, NAPI_GRO_CB(skb)->proto = proto; flush--; - nlen = skb_network_header_len(skb); + nlen = skb_gro_offset(skb) - off; list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; @@ -290,19 +290,8 @@ not_same_flow: nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } - /* flush if Traffic Class fields are different */ - NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) | - (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); - NAPI_GRO_CB(p)->flush |= flush; - - /* If the previous IP ID value was based on an atomic - * datagram we can overwrite the value and ignore it. - */ - if (NAPI_GRO_CB(skb)->is_atomic) - NAPI_GRO_CB(p)->flush_id = 0; } - NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); @@ -419,14 +408,6 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) return inet_gro_complete(skb, nhoff); } -static struct packet_offload ipv6_packet_offload __read_mostly = { - .type = cpu_to_be16(ETH_P_IPV6), - .callbacks = { - .gso_segment = ipv6_gso_segment, - .gro_receive = ipv6_gro_receive, - .gro_complete = ipv6_gro_complete, - }, -}; static struct sk_buff *sit_gso_segment(struct sk_buff *skb, netdev_features_t features) @@ -486,7 +467,15 @@ static int __init ipv6_offload_init(void) if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); - dev_add_offload(&ipv6_packet_offload); + net_hotdata.ipv6_packet_offload = (struct packet_offload) { + .type = cpu_to_be16(ETH_P_IPV6), + .callbacks = { + .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, + }, + }; + dev_add_offload(&net_hotdata.ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 31b86fe661..784424ac41 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -120,7 +120,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); rcu_read_lock(); - nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); + nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (unlikely(IS_ERR_OR_NULL(neigh))) { @@ -234,7 +234,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; - if (unlikely(idev->cnf.disable_ipv6)) { + if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; @@ -501,7 +501,7 @@ int ip6_forward(struct sk_buff *skb) u32 mtu; idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); - if (net->ipv6.devconf_all->forwarding == 0) + if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) goto error; if (skb->pkt_type != PACKET_HOST) @@ -513,8 +513,8 @@ int ip6_forward(struct sk_buff *skb) if (skb_warn_if_lro(skb)) goto drop; - if (!net->ipv6.devconf_all->disable_policy && - (!idev || !idev->cnf.disable_policy) && + if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) && + (!idev || !READ_ONCE(idev->cnf.disable_policy)) && !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; @@ -552,7 +552,7 @@ int ip6_forward(struct sk_buff *skb) } /* XXX: idev->cnf.proxy_ndp? */ - if (net->ipv6.devconf_all->proxy_ndp && + if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { @@ -599,7 +599,7 @@ int ip6_forward(struct sk_buff *skb) * send a redirect. */ - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else @@ -856,7 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; bool mono_delivery_time = skb->mono_delivery_time; @@ -1063,7 +1063,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, return NULL; } - rt = (struct rt6_info *)dst; + rt = dst_rt6_info(dst); /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, @@ -1118,12 +1118,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct rt6_info *rt; *dst = ip6_route_output(net, sk, fl6); - rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; + rt = (*dst)->error ? NULL : dst_rt6_info(*dst); rcu_read_lock(); from = rt ? rcu_dereference(rt->from) : NULL; err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, + fl6->flowi6_l3mdev, &fl6->saddr); rcu_read_unlock(); @@ -1159,7 +1160,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * dst entry and replace it instead with the * dst entry of the nexthop router */ - rt = (struct rt6_info *) *dst; + rt = dst_rt6_info(*dst); rcu_read_lock(); n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); @@ -1423,7 +1424,7 @@ static int __ip6_append_data(struct sock *sk, int offset = 0; bool zc = false; u32 tskey = 0; - struct rt6_info *rt = (struct rt6_info *)cork->dst; + struct rt6_info *rt = dst_rt6_info(cork->dst); bool paged, hold_tskey, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; @@ -1877,7 +1878,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; - struct rt6_info *rt = (struct rt6_info *)cork->base.dst; + struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; @@ -1933,7 +1934,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, u8 icmp6_type; if (sk->sk_socket->type == SOCK_RAW && - !inet_test_bit(HDRINCL, sk)) + !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; @@ -1949,7 +1950,7 @@ out: int ip6_send_skb(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; err = ip6_local_out(net, skb->sk, skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 70478027a7..9dee0c1279 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -247,7 +247,6 @@ static void ip6_dev_free(struct net_device *dev) gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); } static int ip6_tnl_create2(struct net_device *dev) @@ -799,17 +798,15 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct ipv6hdr *ipv6h; int nh, err; - if ((!(tpi->flags & TUNNEL_CSUM) && - (tunnel->parms.i_flags & TUNNEL_CSUM)) || - ((tpi->flags & TUNNEL_CSUM) && - !(tunnel->parms.i_flags & TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags & TUNNEL_SEQ) { - if (!(tpi->flags & TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); @@ -947,7 +944,9 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { - tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } @@ -1747,7 +1746,7 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) if (new_mtu > IP_MAX_MTU - dev->hard_header_len) return -EINVAL; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); @@ -1756,7 +1755,7 @@ int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - return t->parms.link; + return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); @@ -1848,6 +1847,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->features |= NETIF_F_LLTX; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; @@ -1873,13 +1873,10 @@ ip6_tnl_dev_init_gen(struct net_device *dev) t->dev = dev; t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) - goto free_stats; + return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) @@ -1903,9 +1900,6 @@ ip6_tnl_dev_init_gen(struct net_device *dev) destroy_dst: dst_cache_destroy(&t->dst_cache); -free_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -2152,7 +2146,7 @@ struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); - return tunnel->net; + return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); @@ -2283,21 +2277,19 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) +static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + ip6_tnl_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch = ip6_tnl_exit_batch_net, + .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 1163ca6ea4..590737c275 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -174,11 +174,6 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) } } -static void vti6_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); -} - static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); @@ -671,7 +666,8 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { - dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu); + WRITE_ONCE(dev->mtu, + clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } @@ -892,7 +888,6 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -908,8 +903,8 @@ static void vti6_dev_setup(struct net_device *dev) dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; - dev->priv_destructor = vti6_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); @@ -931,9 +926,6 @@ static inline int vti6_dev_init_gen(struct net_device *dev) t->dev = dev; t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; @@ -1175,24 +1167,22 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_batch_net(struct list_head *net_list) +static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct vti6_net *ip6n; struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, &list); + vti6_destroy_tunnels(ip6n, dev_to_kill); } - unregister_netdevice_many(&list); - rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch = vti6_exit_batch_net, + .exit_batch_rtnl = vti6_exit_batch_rtnl, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9782c180fe..dd342e6ecf 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1373,10 +1373,7 @@ int __init ip6_mr_init(void) { int err; - mrt_cachep = kmem_cache_create("ip6_mrt_cache", - sizeof(struct mfc6_cache), - 0, SLAB_HWCACHE_ALIGN, - NULL); + mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN); if (!mrt_cachep) return -ENOMEM; @@ -2276,7 +2273,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, int err; struct mr_table *mrt; struct mfc6_cache *cache; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) @@ -2595,7 +2592,9 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; - struct fib_dump_filter filter = {}; + struct fib_dump_filter filter = { + .rtnl_held = true, + }; int err; if (cb->strict_check) { diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 56c3c467f9..d4c28ec1bc 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -948,6 +948,8 @@ done: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); + if (retv == 0) + inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); @@ -1346,7 +1348,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } if (val < 0) - val = sock_net(sk)->ipv6.devconf_all->hop_limit; + val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } @@ -1445,6 +1447,10 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rxopt.bits.recvfragsize; break; + case IPV6_ROUTER_ALERT: + val = inet6_test_bit(RTALERT, sk); + break; + case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 76ee1615ff..7ba01d8cfb 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -159,9 +159,9 @@ static int unsolicited_report_interval(struct inet6_dev *idev) int iv; if (mld_in_v1_mode(idev)) - iv = idev->cnf.mldv1_unsolicited_report_interval; + iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval); else - iv = idev->cnf.mldv2_unsolicited_report_interval; + iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval); return iv > 0 ? iv : 1; } @@ -1202,15 +1202,15 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, static int mld_force_mld_version(const struct inet6_dev *idev) { + const struct net *net = dev_net(idev->dev); + int all_force; + + all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version); /* Normally, both are 0 here. If enforcement to a particular is * being used, individual device enforcement will have a lower * precedence over 'all' device (.../conf/all/force_mld_version). */ - - if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0) - return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version; - else - return idev->cnf.force_mld_version; + return all_force ?: READ_ONCE(idev->cnf.force_mld_version); } static bool mld_in_v2_mode_only(const struct inet6_dev *idev) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a19999b30b..0282d15725 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -227,6 +227,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { + bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; @@ -262,22 +263,23 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, break; #endif default: - if (ndisc_is_useropt(dev, nd_opt)) { - ndopts->nd_useropts_end = nd_opt; - if (!ndopts->nd_useropts) - ndopts->nd_useropts = nd_opt; - } else { - /* - * Unknown options must be silently ignored, - * to accommodate future extension to the - * protocol. - */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); - } + unknown = true; + } + if (ndisc_is_useropt(dev, nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else if (unknown) { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. + */ + ND_PRINTK(2, notice, + "%s: ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, + nd_opt->nd_opt_len); } next_opt: opt_len -= l; @@ -451,7 +453,7 @@ static void ip6_nd_hdr(struct sk_buff *skb, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - tclass = idev ? idev->cnf.ndisc_tclass : 0; + tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); @@ -535,7 +537,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; - inc_opt |= ifp->idev->cnf.force_tllao; + inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao); in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, @@ -903,8 +905,9 @@ have_ifp: } if (ipv6_chk_acast_addr(net, dev, &msg->target) || - (idev->cnf.forwarding && - (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + (READ_ONCE(idev->cnf.forwarding) && + (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) || + READ_ONCE(idev->cnf.proxy_ndp)) && (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && @@ -929,7 +932,7 @@ have_ifp: } if (is_router < 0) - is_router = idev->cnf.forwarding; + is_router = READ_ONCE(idev->cnf.forwarding); if (dad) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, @@ -973,7 +976,7 @@ static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr) { struct inet6_dev *idev = __in6_dev_get(dev); - switch (idev->cnf.accept_untracked_na) { + switch (READ_ONCE(idev->cnf.accept_untracked_na)) { case 0: /* Don't accept untracked na (absent in neighbor cache) */ return 0; case 1: /* Create new entries from na if currently untracked */ @@ -1024,7 +1027,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) * drop_unsolicited_na takes precedence over accept_untracked_na */ if (!msg->icmph.icmp6_solicited && idev && - idev->cnf.drop_unsolicited_na) + READ_ONCE(idev->cnf.drop_unsolicited_na)) return reason; if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) @@ -1080,7 +1083,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) * Note that we don't do a (daddr == all-routers-mcast) check. */ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE; - if (!neigh && lladdr && idev && idev->cnf.forwarding) { + if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) { if (accept_untracked_na(dev, saddr)) { neigh = neigh_create(&nd_tbl, &msg->target, dev); new_state = NUD_STALE; @@ -1100,7 +1103,8 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && - net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && + READ_ONCE(net->ipv6.devconf_all->forwarding) && + READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { /* XXX: idev->cnf.proxy_ndp */ goto out; @@ -1148,7 +1152,7 @@ static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) } /* Don't accept RS if we're not in router mode */ - if (!idev->cnf.forwarding) + if (!READ_ONCE(idev->cnf.forwarding)) goto out; /* @@ -1237,6 +1241,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) struct ndisc_options ndopts; struct fib6_info *rt = NULL; struct inet6_dev *in6_dev; + struct fib6_table *table; u32 defrtr_usr_metric; unsigned int pref = 0; __u32 old_if_flags; @@ -1317,7 +1322,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) if (old_if_flags != in6_dev->if_flags) send_ifinfo_notify = true; - if (!in6_dev->cnf.accept_ra_defrtr) { + if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", __func__, skb->dev->name); @@ -1325,7 +1330,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_lft) { + if (lifetime != 0 && + lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) { ND_PRINTK(2, info, "RA: router lifetime (%ds) is too short: %s\n", lifetime, skb->dev->name); @@ -1336,7 +1342,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) * accept_ra_from_local is set to true. */ net = dev_net(in6_dev->dev); - if (!in6_dev->cnf.accept_ra_from_local && + if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", @@ -1348,7 +1354,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ if (pref == ICMPV6_ROUTER_PREF_INVALID || - !in6_dev->cnf.accept_ra_rtr_pref) + !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref)) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif /* routes added from RAs do not use nexthop objects */ @@ -1382,7 +1388,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, - skb->dev, pref, defrtr_usr_metric); + skb->dev, pref, defrtr_usr_metric, + lifetime); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", @@ -1409,12 +1416,21 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } - if (rt) + if (rt) { + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(rt, jiffies + (HZ * lifetime)); - if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && + fib6_add_gc_list(rt); + + spin_unlock_bh(&table->tb6_lock); + } + if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 && ra_msg->icmph.icmp6_hop_limit) { - if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { - in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <= + ra_msg->icmph.icmp6_hop_limit) { + WRITE_ONCE(in6_dev->cnf.hop_limit, + ra_msg->icmph.icmp6_hop_limit); fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { @@ -1496,7 +1512,7 @@ skip_linkparms: } #ifdef CONFIG_IPV6_ROUTE_INFO - if (!in6_dev->cnf.accept_ra_from_local && + if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, @@ -1505,7 +1521,7 @@ skip_linkparms: goto skip_routeinfo; } - if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { + if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; @@ -1517,14 +1533,14 @@ skip_linkparms: continue; #endif if (ri->prefix_len == 0 && - !in6_dev->cnf.accept_ra_defrtr) + !READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) continue; if (ri->lifetime != 0 && - ntohl(ri->lifetime) < in6_dev->cnf.accept_ra_min_lft) + ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) continue; - if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen)) continue; - if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) + if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen)) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); @@ -1544,7 +1560,7 @@ skip_routeinfo: } #endif - if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { + if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; @@ -1555,7 +1571,7 @@ skip_routeinfo: } } - if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { + if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) { __be32 n; u32 mtu; @@ -1569,8 +1585,8 @@ skip_routeinfo: if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); - } else if (in6_dev->cnf.mtu6 != mtu) { - in6_dev->cnf.mtu6 = mtu; + } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) { + WRITE_ONCE(in6_dev->cnf.mtu6, mtu); fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } @@ -1708,7 +1724,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) if (IS_ERR(dst)) return; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, @@ -1804,7 +1820,7 @@ static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && - idev->cnf.suppress_frag_ndisc) { + READ_ONCE(idev->cnf.suppress_frag_ndisc)) { net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); return true; } @@ -1881,8 +1897,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, idev = in6_dev_get(dev); if (!idev) break; - if (idev->cnf.ndisc_notify || - net->ipv6.devconf_all->ndisc_notify) + if (READ_ONCE(idev->cnf.ndisc_notify) || + READ_ONCE(net->ipv6.devconf_all->ndisc_notify)) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; @@ -1891,8 +1907,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, if (!idev) evict_nocarrier = true; else { - evict_nocarrier = idev->cnf.ndisc_evict_nocarrier && - net->ipv6.devconf_all->ndisc_evict_nocarrier; + evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) && + READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier); in6_dev_put(idev); } @@ -1966,7 +1982,7 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) idev->nd_parms->reachable_time = neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); - idev->tstamp = jiffies; + WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 53d255838e..5d989d8030 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -36,6 +36,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), }; int err; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 0ba62f4868..f3c8e2d918 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -6,6 +6,10 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +# old sockopt interface and eval loop +config IP6_NF_IPTABLES_LEGACY + tristate + config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help @@ -147,7 +151,7 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED - depends on IP6_NF_MANGLE || IP6_NF_RAW + depends on IP6_NF_MANGLE || IP6_NF_RAW || NFT_COMPAT help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -186,6 +190,8 @@ config IP6_NF_TARGET_HL config IP6_NF_FILTER tristate "Packet filtering" default m if NETFILTER_ADVANCED=n + select IP6_NF_IPTABLES_LEGACY + tristate help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -195,7 +201,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_REJECT tristate "REJECT target support" - depends on IP6_NF_FILTER + depends on IP6_NF_FILTER || NFT_COMPAT select NF_REJECT_IPV6 default m if NETFILTER_ADVANCED=n help @@ -221,6 +227,7 @@ config IP6_NF_TARGET_SYNPROXY config IP6_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n + select IP6_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -230,6 +237,7 @@ config IP6_NF_MANGLE config IP6_NF_RAW tristate 'raw table support (required for TRACE)' + select IP6_NF_IPTABLES_LEGACY help This option adds a `raw' table to ip6tables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -243,6 +251,7 @@ config IP6_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED + select IP6_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -254,6 +263,7 @@ config IP6_NF_NAT depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT + select IP6_NF_IPTABLES_LEGACY select NETFILTER_XT_NAT help This enables the `nat' table in ip6tables. This allows masquerading, @@ -262,25 +272,23 @@ config IP6_NF_NAT To compile it as a module, choose M here. If unsure, say N. -if IP6_NF_NAT - config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NETFILTER_XT_TARGET_MASQUERADE + depends on IP6_NF_NAT help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP6_NF_TARGET_NPT tristate "NPT (Network Prefix translation) target support" + depends on IP6_NF_NAT || NFT_COMPAT help This option adds the `SNPT' and `DNPT' target, which perform stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296. To compile it as a module, choose M here. If unsure, say N. -endif # IP6_NF_NAT - endif # IP6_NF_IPTABLES endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index b8d6dc9aee..66ce6fa5b2 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -4,7 +4,7 @@ # # Link order matters here. -obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_IPTABLES_LEGACY) += ip6_tables.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index df785ebda0..e8992693e1 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -43,7 +43,7 @@ static int ip6table_filter_table_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = - forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 52cf104e34..e119d4f090 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -147,23 +147,27 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = xt_register_template(&nf_nat_ipv6_table, - ip6table_nat_table_init); + int ret; + /* net->gen->ptr[ip6table_nat_net_id] must be allocated + * before calling ip6t_nat_register_lookups(). + */ + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret < 0) return ret; - ret = register_pernet_subsys(&ip6table_nat_net_ops); + ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); if (ret) - xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); return ret; } static void __exit ip6table_nat_exit(void) { - unregister_pernet_subsys(&ip6table_nat_net_ops); xt_unregister_template(&nf_nat_ipv6_table); + unregister_pernet_subsys(&ip6table_nat_net_ops); } module_init(ip6table_nat_init); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index efbec7ee27..5e1b50c6a4 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -62,7 +62,6 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; static int nf_ct_frag6_sysctl_register(struct net *net) @@ -105,7 +104,7 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); @@ -328,9 +327,9 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, if (!reasm_data) goto err; - payload_len = ((skb->data - skb_network_header(skb)) - + payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - - sizeof(struct frag_hdr)); + sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 196dd4ecb5..dedee264b8 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -83,7 +83,7 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, - net->ipv6.devconf_all->hop_limit); + READ_ONCE(net->ipv6.devconf_all->hop_limit)); nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); @@ -124,7 +124,7 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, skb_reserve(nskb, LL_MAX_HEADER); nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, - net->ipv6.devconf_all->hop_limit); + READ_ONCE(net->ipv6.devconf_all->hop_limit)); skb_reset_transport_header(nskb); icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index b5205311f3..806d4b5dd1 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -111,9 +111,9 @@ int ip6_dst_hoplimit(struct dst_entry *dst) rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) - hoplimit = idev->cnf.hop_limit; + hoplimit = READ_ONCE(idev->cnf.hop_limit); else - hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); rcu_read_unlock(); } return hoplimit; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ef2059c889..88b3fcacd4 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,7 +154,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 03dbb874c3..2eedf25560 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -160,6 +160,13 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, inet6_iif(skb), inet6_sdif(skb))) continue; + + if (atomic_read(&sk->sk_rmem_alloc) >= + READ_ONCE(sk->sk_rcvbuf)) { + atomic_inc(&sk->sk_drops); + continue; + } + delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: @@ -288,8 +295,7 @@ out: } static void rawv6_err(struct sock *sk, struct sk_buff *skb, - struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) + u8 type, u8 code, int offset, __be32 info) { bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -344,7 +350,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) continue; - rawv6_err(sk, skb, NULL, type, code, inner_offset, info); + rawv6_err(sk, skb, type, code, inner_offset, info); } rcu_read_unlock(); } @@ -592,7 +598,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct ipv6hdr *iph; struct sk_buff *skb; int err; - struct rt6_info *rt = (struct rt6_info *)*dstp; + struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; @@ -911,7 +917,7 @@ back_from_confirm: ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, - len, 0, &ipc6, &fl6, (struct rt6_info *)dst, + len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) @@ -935,7 +941,7 @@ do_confirm: goto done; } -static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, +static int rawv6_seticmpfilter(struct sock *sk, int optname, sockptr_t optval, int optlen) { switch (optname) { @@ -952,7 +958,7 @@ static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, return 0; } -static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, +static int rawv6_geticmpfilter(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int len; @@ -1038,7 +1044,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; - return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + return rawv6_seticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) @@ -1099,7 +1105,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; - return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + return rawv6_geticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5ebc47da10..327caca642 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -272,9 +272,9 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, if (!reasm_data) goto out_oom; - payload_len = ((skb->data - skb_network_header(skb)) - + payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - - sizeof(struct frag_hdr)); + sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) goto out_oversize; @@ -369,7 +369,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; - if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); @@ -436,7 +436,6 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -449,7 +448,6 @@ static struct ctl_table ip6_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) @@ -487,7 +485,7 @@ err_alloc: static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ef815ba583..c9a9506b71 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -87,7 +87,8 @@ struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); -static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); @@ -226,7 +227,7 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst->dev, skb, daddr); @@ -234,8 +235,8 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst->dev; - struct rt6_info *rt = (struct rt6_info *)dst; daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) @@ -354,7 +355,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; @@ -373,7 +374,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; if (idev && idev->dev != blackhole_netdev) { @@ -637,6 +638,8 @@ static void rt6_probe(struct fib6_nh *fib6_nh) rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); + if (!idev) + goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) @@ -645,14 +648,15 @@ static void rt6_probe(struct fib6_nh *fib6_nh) write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, - neigh->updated + idev->cnf.rtr_probe_interval)) { + neigh->updated + + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + - idev->cnf.rtr_probe_interval)) { + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } @@ -931,6 +935,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; + struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; @@ -989,10 +994,18 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (!addrconf_finite_timeout(lifetime)) + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); - else + fib6_remove_gc_list(rt); + } else { fib6_set_expires(rt, jiffies + HZ * lifetime); + fib6_add_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } @@ -1278,7 +1291,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) - return (struct rt6_info *) dst; + return dst_rt6_info(dst); dst_release(dst); @@ -1398,6 +1411,7 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); + /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); @@ -1587,7 +1601,7 @@ static unsigned int fib6_mtu(const struct fib6_result *res) rcu_read_lock(); idev = __in6_dev_get(dev); - mtu = idev->cnf.mtu6; + mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } @@ -2085,12 +2099,12 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { - RT6_TRACE("aging clone %p\n", rt); + pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } else if (time_after(jiffies, rt->dst.expires)) { - RT6_TRACE("purging expired route %p\n", rt); + pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } @@ -2101,8 +2115,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); + pr_debug("purging route %p via non-router but gateway\n", + rt); rt6_remove_exception(bucket, rt6_ex); return; } @@ -2211,7 +2225,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; - if (net->ipv6.devconf_all->forwarding == 0) + if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); @@ -2637,7 +2651,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); - rt6 = (struct rt6_info *)dst; + rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; @@ -2651,7 +2665,7 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; @@ -2734,7 +2748,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, struct fib6_info *from; struct rt6_info *rt; - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; @@ -2760,24 +2774,24 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); -static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +static void ip6_negative_advice(struct sock *sk, + struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; + struct rt6_info *rt = dst_rt6_info(dst); - if (rt) { - if (rt->rt6i_flags & RTF_CACHE) { - rcu_read_lock(); - if (rt6_check_expired(rt)) { - rt6_remove_exception_rt(rt); - dst = NULL; - } - rcu_read_unlock(); - } else { - dst_release(dst); - dst = NULL; + if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); + if (rt6_check_expired(rt)) { + /* counteract the dst_release() in sk_dst_reset() */ + dst_hold(dst); + sk_dst_reset(sk); + + rt6_remove_exception_rt(rt); } + rcu_read_unlock(); + return; } - return dst; + sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) @@ -2786,7 +2800,7 @@ static void ip6_link_failure(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { @@ -2842,7 +2856,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. @@ -3240,8 +3254,8 @@ u32 ip6_mtu_from_fib6(const struct fib6_result *res, mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); - if (idev && idev->cnf.mtu6 > mtu) - mtu = idev->cnf.mtu6; + if (idev) + mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); @@ -3591,7 +3605,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, if (!dev) goto out; - if (idev->cnf.disable_ipv6) { + if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; @@ -3765,8 +3779,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); - else - fib6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -4142,7 +4154,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) return; - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) + if (READ_ONCE(in6_dev->cnf.forwarding) || + !READ_ONCE(in6_dev->cnf.accept_redirects)) return; /* RFC2461 8.1: @@ -4165,7 +4178,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu } } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; @@ -4355,7 +4368,8 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric) + u32 defrtr_usr_metric, + int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, @@ -4368,6 +4382,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, + .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; @@ -4434,7 +4449,7 @@ static void rtmsg_to_fib6_config(struct net *net, .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, - .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, + .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, @@ -4464,6 +4479,9 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtnl_lock(); switch (cmd) { case SIOCADDRT: + /* Only do the default setting of fc_metric in route adding */ + if (cfg.fc_metric == 0) + cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: @@ -4574,8 +4592,8 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, f6i->dst_nocount = true; if (!anycast && - (net->ipv6.devconf_all->disable_policy || - idev->cnf.disable_policy)) + (READ_ONCE(net->ipv6.devconf_all->disable_policy) || + READ_ONCE(idev->cnf.disable_policy))) f6i->dst_nopolicy = true; } @@ -5597,7 +5615,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; @@ -5671,7 +5689,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } @@ -6100,7 +6118,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); @@ -6327,12 +6345,12 @@ static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, if (!write) return -EINVAL; - net = (struct net *)ctl->extra1; - delay = net->ipv6.sysctl.flush_delay; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } @@ -6417,7 +6435,6 @@ static struct ctl_table ipv6_route_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) @@ -6441,10 +6458,6 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[1].procname = NULL; } return table; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index a013b92cbb..2c83b75864 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -212,9 +212,9 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(err)) goto drop; - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&rlwt->cache); - preempt_enable(); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -234,9 +234,9 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - preempt_disable(); + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - preempt_enable(); + local_bh_enable(); } skb_dst_drop(skb); @@ -268,23 +268,21 @@ static int rpl_input(struct sk_buff *skb) return err; } - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&rlwt->cache); - preempt_enable(); if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { - preempt_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); } } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } + local_bh_enable(); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 35508abd76..a31521e270 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -551,6 +551,8 @@ out_unregister_iptun: #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL out_unregister_genl: +#endif +#if IS_ENABLED(CONFIG_IPV6_SEG6_LWTUNNEL) || IS_ENABLED(CONFIG_IPV6_SEG6_HMAC) genl_unregister_family(&seg6_genl_family); #endif out_unregister_pernet: @@ -564,8 +566,9 @@ void seg6_exit(void) seg6_hmac_exit(); #endif #ifdef CONFIG_IPV6_SEG6_LWTUNNEL + seg6_local_exit(); seg6_iptunnel_exit(); #endif - unregister_pernet_subsys(&ip6_segments_ops); genl_unregister_family(&seg6_genl_family); + unregister_pernet_subsys(&ip6_segments_ops); } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index d43c50a731..bbf5b84a70 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -241,6 +241,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) struct sr6_tlv_hmac *tlv; struct ipv6_sr_hdr *srh; struct inet6_dev *idev; + int require_hmac; idev = __in6_dev_get(skb->dev); @@ -248,16 +249,17 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) tlv = seg6_get_tlv_hmac(srh); + require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac); /* mandatory check but no tlv */ - if (idev->cnf.seg6_require_hmac > 0 && !tlv) + if (require_hmac > 0 && !tlv) return false; /* no check */ - if (idev->cnf.seg6_require_hmac < 0) + if (require_hmac < 0) return true; /* check only if present */ - if (idev->cnf.seg6_require_hmac == 0 && !tlv) + if (require_hmac == 0 && !tlv) return true; /* now, seg6_require_hmac >= 0 && tlv */ @@ -354,6 +356,7 @@ static int seg6_hmac_init_algo(void) struct crypto_shash *tfm; struct shash_desc *shash; int i, alg_count, cpu; + int ret = -ENOMEM; alg_count = ARRAY_SIZE(hmac_algos); @@ -364,12 +367,14 @@ static int seg6_hmac_init_algo(void) algo = &hmac_algos[i]; algo->tfms = alloc_percpu(struct crypto_shash *); if (!algo->tfms) - return -ENOMEM; + goto error_out; for_each_possible_cpu(cpu) { tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto error_out; + } p_tfm = per_cpu_ptr(algo->tfms, cpu); *p_tfm = tfm; } @@ -381,18 +386,22 @@ static int seg6_hmac_init_algo(void) algo->shashs = alloc_percpu(struct shash_desc *); if (!algo->shashs) - return -ENOMEM; + goto error_out; for_each_possible_cpu(cpu) { shash = kzalloc_node(shsize, GFP_KERNEL, cpu_to_node(cpu)); if (!shash) - return -ENOMEM; + goto error_out; *per_cpu_ptr(algo->shashs, cpu) = shash; } } return 0; + +error_out: + seg6_hmac_exit(); + return ret; } int __init seg6_hmac_init(void) @@ -410,22 +419,29 @@ int __net_init seg6_hmac_net_init(struct net *net) void seg6_hmac_exit(void) { struct seg6_hmac_algo *algo = NULL; + struct crypto_shash *tfm; + struct shash_desc *shash; int i, alg_count, cpu; alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; - for_each_possible_cpu(cpu) { - struct crypto_shash *tfm; - struct shash_desc *shash; - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); + if (algo->shashs) { + for_each_possible_cpu(cpu) { + shash = *per_cpu_ptr(algo->shashs, cpu); + kfree(shash); + } + free_percpu(algo->shashs); + } + + if (algo->tfms) { + for_each_possible_cpu(cpu) { + tfm = *per_cpu_ptr(algo->tfms, cpu); + crypto_free_shash(tfm); + } + free_percpu(algo->tfms); } - free_percpu(algo->tfms); - free_percpu(algo->shashs); } } EXPORT_SYMBOL(seg6_hmac_exit); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 03b877ff45..098632adc9 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -459,34 +459,30 @@ static int seg6_input_core(struct net *net, struct sock *sk, int err; err = seg6_do_srh(skb); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + if (unlikely(err)) + goto drop; slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { - preempt_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); - preempt_enable(); } } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } + local_bh_enable(); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) - return err; + goto drop; if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, @@ -494,6 +490,9 @@ static int seg6_input_core(struct net *net, struct sock *sk, skb_dst(skb)->dev, seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); +drop: + kfree_skb(skb); + return err; } static int seg6_input_nf(struct sk_buff *skb) @@ -535,9 +534,9 @@ static int seg6_output_core(struct net *net, struct sock *sk, slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); - preempt_disable(); + local_bh_disable(); dst = dst_cache_get(&slwt->cache); - preempt_enable(); + local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -557,9 +556,9 @@ static int seg6_output_core(struct net *net, struct sock *sk, goto drop; } - preempt_disable(); + local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - preempt_enable(); + local_bh_enable(); } skb_dst_drop(skb); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 24e2b4b494..c434940131 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -941,8 +941,8 @@ static int input_action_end_dx6(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx6_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx6_finish); return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: @@ -991,8 +991,8 @@ static int input_action_end_dx4(struct sk_buff *skb, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, input_action_end_dx4_finish); + dev_net(skb->dev), NULL, skb, skb->dev, + NULL, input_action_end_dx4_finish); return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index fbad6e1c97..83b195f095 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -132,8 +132,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, - struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu ** +__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -207,7 +207,7 @@ static int ipip6_tunnel_create(struct net_device *dev) __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); - if ((__force u16)t->parms.i_flags & SIT_ISATAP) + if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags)) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; @@ -226,7 +226,8 @@ out: } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) + struct ip_tunnel_parm_kern *parms, + int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -1135,7 +1136,8 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, +static void ipip6_tunnel_update(struct ip_tunnel *t, + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct net *net = t->net; @@ -1196,11 +1198,11 @@ static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; - struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1251,7 +1253,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1268,7 +1270,7 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1281,7 +1283,7 @@ ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1297,7 +1299,7 @@ ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1328,7 +1330,7 @@ ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1348,7 +1350,8 @@ ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { switch (cmd) { case SIOCGETTUNNEL: @@ -1398,7 +1401,6 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; @@ -1408,7 +1410,6 @@ static void ipip6_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); } #define SIT_FEATURES (NETIF_F_SG | \ @@ -1437,6 +1438,8 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->features |= NETIF_F_LLTX; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + } static int ipip6_tunnel_init(struct net_device *dev) @@ -1449,16 +1452,11 @@ static int ipip6_tunnel_init(struct net_device *dev) strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (err) { - free_percpu(dev->tstats); - dev->tstats = NULL; + if (err) return err; - } + netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; @@ -1495,7 +1493,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1604,8 +1602,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD @@ -1692,7 +1690,7 @@ static size_t ipip6_get_size(const struct net_device *dev) static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || @@ -1702,7 +1700,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || - nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || + nla_put_be16(skb, IFLA_IPTUN_FLAGS, + ip_tunnel_flags_to_be16(parm->i_flags)) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; @@ -1876,22 +1875,19 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_batch_net(struct list_head *net_list) +static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { - LIST_HEAD(list); struct net *net; - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, &list); - - unregister_netdevice_many(&list); - rtnl_unlock(); + sit_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch = sit_exit_batch_net, + .exit_batch_rtnl = sit_exit_batch_rtnl, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c8d2ca2722..bfad1e89b6 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -177,24 +177,33 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) struct sock *ret = sk; __u8 rcv_wscale; int full_space; + SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; - req = cookie_tcp_check(net, sk, skb); - if (IS_ERR(req)) - goto out; - if (!req) + if (cookie_bpf_ok(skb)) { + req = cookie_bpf_check(sk, skb); + } else { + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + } + if (!req) { + SKB_DR_SET(reason, NO_SOCKET); goto out_drop; + } ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (security_inet_conn_request(sk, skb, req)) + if (security_inet_conn_request(sk, skb, req)) { + SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; + } if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || @@ -231,11 +240,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); - if (IS_ERR(dst)) + if (IS_ERR(dst)) { + SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; + } } - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && @@ -247,14 +258,23 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; + /* req->syncookie is set true only if ACK is validated + * by BPF kfunc, then, rcv_wscale is already configured. + */ + if (!req->syncookie) + ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); ret = tcp_get_cookie_sock(sk, skb, req, dst); + if (!ret) { + SKB_DR_SET(reason, NO_SOCKET); + goto out_drop; + } out: return ret; out_free: reqsk_free(req); out_drop: + kfree_skb_reason(skb, reason); return NULL; } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 888676163e..c060285ff4 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -213,7 +213,6 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, - { } }; static struct ctl_table ipv6_rotable[] = { @@ -248,11 +247,11 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ - { } }; static int __net_init ipv6_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; @@ -264,7 +263,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++) + for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); @@ -276,8 +275,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) goto out_ipv6_route_table; net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", - ipv6_table, - ARRAY_SIZE(ipv6_table_template)); + ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; @@ -313,9 +311,9 @@ out_ipv6_table: static void __net_exit ipv6_sysctl_net_exit(struct net *net) { - struct ctl_table *ipv6_table; - struct ctl_table *ipv6_route_table; - struct ctl_table *ipv6_icmp_table; + const struct ctl_table *ipv6_table; + const struct ctl_table *ipv6_route_table; + const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 57b25b1fc9..3385faf1d5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -58,7 +58,9 @@ #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> +#include <net/hotdata.h> #include <net/busy_poll.h> +#include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -68,7 +70,8 @@ #include <trace/events/tcp.h> -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -94,11 +97,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } @@ -489,14 +490,10 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; case TCP_LISTEN: break; @@ -792,7 +789,8 @@ clear_hash_nostart: static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, - struct sk_buff *skb) + struct sk_buff *skb, + u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); @@ -806,7 +804,7 @@ static void tcp_v6_init_req(struct request_sock *req, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - if (!TCP_SKB_CB(skb)->tcp_tw_isn && + if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || @@ -819,9 +817,10 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { - tcp_v6_init_req(req, sk, skb); + tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; @@ -1005,7 +1004,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 kfree_skb(buff); } -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -1112,7 +1112,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { - trace_tcp_send_reset(sk, skb); if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); @@ -1128,6 +1127,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); } + trace_tcp_send_reset(sk, skb, reason); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key); @@ -1267,15 +1268,10 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ - /* RFC 7323 2.3 - * The window field (SEG.WND) of every outgoing segment, with the - * exception of <SYN> segments, MUST be right-shifted by - * Rcv.Wind.Shift bits: - */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, - req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, + tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, @@ -1439,7 +1435,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); @@ -1450,6 +1445,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + ip6_dst_store(newsk, dst, NULL, NULL); + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; @@ -1623,7 +1620,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (np->rxopt.all) opt_skb = skb_clone_and_charge_r(skb, sk); - reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1653,12 +1649,12 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); - if (!nsk) - goto discard; - if (nsk != sk) { - if (tcp_child_process(sk, nsk, skb)) - goto reset; + if (nsk) { + reason = tcp_child_process(sk, nsk, skb); + if (reason) + goto reset; + } if (opt_skb) __kfree_skb(opt_skb); return 0; @@ -1666,14 +1662,15 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb); - if (tcp_rcv_state_process(sk, skb)) + reason = tcp_rcv_state_process(sk, skb); + if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); @@ -1737,7 +1734,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1754,6 +1750,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -1790,7 +1787,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -1856,15 +1852,21 @@ process: if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); - } else if (tcp_child_process(sk, nsk, skb)) { - tcp_v6_send_reset(nsk, skb); - goto discard_and_relse; } else { + drop_reason = tcp_child_process(sk, nsk, skb); + if (drop_reason) { + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v6_send_reset(nsk, skb, rst_reason); + goto discard_and_relse; + } sock_put(sk); return 0; } } +process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { @@ -1933,7 +1935,7 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v6_send_reset(NULL, skb); + tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: @@ -1961,7 +1963,7 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; @@ -1979,6 +1981,7 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } @@ -1988,7 +1991,7 @@ do_time_wait: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: @@ -2038,7 +2041,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, .twsk_destructor = tcp_twsk_destructor, }; @@ -2365,11 +2367,6 @@ struct proto tcpv6_prot = { }; EXPORT_SYMBOL_GPL(tcpv6_prot); -static const struct inet6_protocol tcpv6_protocol = { - .handler = tcp_v6_rcv, - .err_handler = tcp_v6_err, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, @@ -2391,22 +2388,21 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - tcp_twsk_purge(net_exit_list, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) { int ret; - ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP); + net_hotdata.tcpv6_protocol = (struct inet6_protocol) { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, + }; + ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; @@ -2431,7 +2427,7 @@ out_tcpv6_pernet_subsys: out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: - inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); + inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } @@ -2439,5 +2435,5 @@ void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); - inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); + inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); } diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index bf0c957e4b..23971903e6 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -7,31 +7,84 @@ */ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> +#include <net/inet6_hashtables.h> #include <net/gro.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ +#if IS_ENABLED(CONFIG_IPV6) + const struct ipv6hdr *hdr; + struct sk_buff *p; + struct sock *sk; + struct net *net; + int iif, sdif; + + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) + return; + + p = tcp_gro_lookup(head, th); + if (p) { + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; + return; + } + + inet6_get_iif_sdif(skb, &iif, &sdif); + hdr = skb_gro_network_header(skb); + net = dev_net(skb->dev); + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + iif, sdif); + NAPI_GRO_CB(skb)->is_flist = !sk; + if (sk) + sock_put(sk); +#endif /* IS_ENABLED(CONFIG_IPV6) */ +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - ip6_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + ip6_gro_compute_pseudo)) + goto flush; - return tcp_gro_receive(head, skb); + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; + + tcp6_check_fraglist_gro(head, skb, th); + + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + __skb_incr_checksum_unnecessary(skb); + + return 0; + } + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, &iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; @@ -40,6 +93,61 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) return 0; } +static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + __be16 *oldport, __be16 newport) +{ + struct tcphdr *th; + + if (*oldport == newport) + return; + + th = tcp_hdr(seg); + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); + *oldport = newport; +} + +static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct tcphdr *th; + const struct ipv6hdr *iph; + struct sk_buff *seg; + struct tcphdr *th2; + struct ipv6hdr *iph2; + + seg = segs; + th = tcp_hdr(seg); + iph = ipv6_hdr(seg); + th2 = tcp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + th2 = tcp_hdr(seg); + iph2 = ipv6_hdr(seg); + + iph2->saddr = iph->saddr; + iph2->daddr = iph->daddr; + __tcpv6_gso_segment_csum(seg, &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest); + } + + return segs; +} + +static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + return __tcpv6_gso_segment_list_csum(skb); +} + static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -51,6 +159,9 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) + return __tcp6_gso_segment_list(skb, features); + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); @@ -66,15 +177,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, return tcp_gso_segment(skb, features); } -static const struct net_offload tcpv6_offload = { - .callbacks = { - .gso_segment = tcp6_gso_segment, - .gro_receive = tcp6_gro_receive, - .gro_complete = tcp6_gro_complete, - }, -}; int __init tcpv6_offload_init(void) { - return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP); + net_hotdata.tcpv6_offload = (struct net_offload) { + .callbacks = { + .gso_segment = tcp6_gso_segment, + .gro_receive = tcp6_gro_receive, + .gro_complete = tcp6_gro_complete, + }, + }; + return inet6_add_offload(&net_hotdata.tcpv6_offload, IPPROTO_TCP); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8c14c4cc82..c81a07ac04 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/indirect_call_wrapper.h> +#include <trace/events/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> @@ -79,9 +80,6 @@ u32 udp6_ehashfn(const struct net *net, const struct in6_addr *faddr, const __be16 fport) { - static u32 udp6_ehash_secret __read_mostly; - static u32 udp_ipv6_hash_secret __read_mostly; - u32 lhash, fhash; net_get_random_once(&udp6_ehash_secret, @@ -171,15 +169,21 @@ static struct sock *udp6_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -200,8 +204,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; @@ -275,7 +285,8 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; @@ -661,8 +672,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } @@ -900,11 +911,8 @@ start_lookup: static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - if (udp_sk_rx_dst_set(sk, dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); - } + if (udp_sk_rx_dst_set(sk, dst)) + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and @@ -1101,11 +1109,12 @@ void udp_v6_early_demux(struct sk_buff *skb) else return; - if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) + if (!sk) return; skb->sk = sk; - skb->destructor = sock_efree; + DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); + skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) @@ -1574,7 +1583,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, - (struct rt6_info *)dst, + dst_rt6_info(dst), msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) @@ -1601,7 +1610,7 @@ do_append_data: ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), - &ipc6, fl6, (struct rt6_info *)dst, + &ipc6, fl6, dst_rt6_info(dst), corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); @@ -1703,11 +1712,6 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname, return ipv6_getsockopt(sk, level, optname, optval, optlen); } -static const struct inet6_protocol udpv6_protocol = { - .handler = udpv6_rcv, - .err_handler = udpv6_err, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS @@ -1804,7 +1808,12 @@ int __init udpv6_init(void) { int ret; - ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP); + net_hotdata.udpv6_protocol = (struct inet6_protocol) { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, + }; + ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); if (ret) goto out; @@ -1815,12 +1824,12 @@ out: return ret; out_udpv6_protocol: - inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); goto out; } void udpv6_exit(void) { inet6_unregister_protosw(&udpv6_protosw); - inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP); } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 626d7b362d..b41152dd42 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -164,7 +164,8 @@ flush: INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); /* do fraglist only if there is no outer UDP encap (or we already processed it) */ @@ -186,20 +187,19 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb); } -static const struct net_offload udpv6_offload = { - .callbacks = { - .gso_segment = udp6_ufo_fragment, - .gro_receive = udp6_gro_receive, - .gro_complete = udp6_gro_complete, - }, -}; - -int udpv6_offload_init(void) +int __init udpv6_offload_init(void) { - return inet6_add_offload(&udpv6_offload, IPPROTO_UDP); + net_hotdata.udpv6_offload = (struct net_offload) { + .callbacks = { + .gso_segment = udp6_ufo_fragment, + .gro_receive = udp6_gro_receive, + .gro_complete = udp6_gro_complete, + }, + }; + return inet6_add_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP); } int udpv6_offload_exit(void) { - return inet6_del_offload(&udpv6_offload, IPPROTO_UDP); + return inet6_del_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP); } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 6e36e5047f..4abc5e9d63 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -43,7 +43,7 @@ static int xfrm6_transport_finish2(struct net *net, struct sock *sk, int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); - int nhlen = skb->data - skb_network_header(skb); + int nhlen = -skb_network_offset(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; @@ -58,7 +58,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { - skb_mac_header_rebuild(skb); + /* The full l2 header needs to be preserved so that re-injecting the packet at l2 + * works correctly in the presence of vlan tags. + */ + skb_mac_header_rebuild_full(skb, xo->orig_mac_len); + skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } @@ -109,19 +113,6 @@ static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull /* Must be an IKE packet.. pass it through */ return 1; break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - /* Check if this is a keepalive packet. If so, eat it. */ - if (len == 1 && udpdata[0] == 0xff) { - return -EINVAL; - } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && - udpdata32[0] == 0 && udpdata32[1] == 0) { - - /* ESP Packet with Non-IKE marker */ - len = sizeof(struct udphdr) + 2 * sizeof(u32); - } else - /* Must be an IKE packet.. pass it through */ - return 1; - break; } /* At this point we are sure that this is an ESPinUDP packet, @@ -279,6 +270,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, if (!x) continue; + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + xfrm_state_put(x); + x = NULL; + continue; + } + spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 42fb6996b0..2f1ea5f999 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -56,12 +56,18 @@ static int xfrm6_get_saddr(struct net *net, int oif, { struct dst_entry *dst; struct net_device *dev; + struct inet6_dev *idev; dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; - dev = ip6_dst_idev(dst)->dev; + idev = ip6_dst_idev(dst); + if (!idev) { + dst_release(dst); + return -EHOSTUNREACH; + } + dev = idev->dev; ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); dst_release(dst); return 0; @@ -70,7 +76,7 @@ static int xfrm6_get_saddr(struct net *net, int oif, static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rt6_info *rt = (struct rt6_info *)xdst->route; + struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); @@ -184,7 +190,6 @@ static struct ctl_table xfrm6_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __net_init xfrm6_net_sysctl_init(struct net *net) @@ -218,7 +223,7 @@ err_alloc: static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f6cb94f82c..bf140ef781 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -355,10 +355,7 @@ static int __init xfrm6_tunnel_init(void) { int rv; - xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", - sizeof(struct xfrm6_tunnel_spi), - 0, SLAB_HWCACHE_ALIGN, - NULL); + xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 498a0c35b7..c00323fa9e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } @@ -335,8 +335,8 @@ static void iucv_sever_path(struct sock *sk, int with_user_data) struct iucv_sock *iucv = iucv_sk(sk); struct iucv_path *path = iucv->path; - if (iucv->path) { - iucv->path = NULL; + /* Whoever resets the path pointer, must sever and free it. */ + if (xchg(&iucv->path, NULL)) { if (with_user_data) { low_nmcpy(user_data, iucv->src_name); high_nmcpy(user_data, iucv->dst_name); @@ -795,7 +795,7 @@ done: /* Accept a pending connection */ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *nsk; @@ -809,7 +809,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, goto done; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection */ add_wait_queue_exclusive(sk_sleep(sk), &wait); @@ -1060,13 +1060,12 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, int i; /* skip iucv_array lying in the headroom */ - iba[0].address = (u32)(addr_t)skb->data; + iba[0].address = virt_to_dma32(skb->data); iba[0].length = (u32)skb_headlen(skb); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - iba[i + 1].address = - (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].address = virt_to_dma32(skb_frag_address(frag)); iba[i + 1].length = (u32)skb_frag_size(frag); } err = pr_iucv->message_send(iucv->path, &txmsg, @@ -1162,13 +1161,12 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, struct iucv_array *iba = (struct iucv_array *)skb->head; int i; - iba[0].address = (u32)(addr_t)skb->data; + iba[0].address = virt_to_dma32(skb->data); iba[0].length = (u32)skb_headlen(skb); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - iba[i + 1].address = - (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].address = virt_to_dma32(skb_frag_address(frag)); iba[i + 1].length = (u32)skb_frag_size(frag); } rc = pr_iucv->message_receive(path, msg, diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index b0b3e9c5af..b7bf34a5eb 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -73,8 +73,42 @@ const struct bus_type iucv_bus = { }; EXPORT_SYMBOL(iucv_bus); -struct device *iucv_root; -EXPORT_SYMBOL(iucv_root); +static struct device *iucv_root; + +static void iucv_release_device(struct device *device) +{ + kfree(device); +} + +struct device *iucv_alloc_device(const struct attribute_group **attrs, + struct device_driver *driver, + void *priv, const char *fmt, ...) +{ + struct device *dev; + va_list vargs; + int rc; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + goto out_error; + va_start(vargs, fmt); + rc = dev_set_name(dev, fmt, vargs); + va_end(vargs); + if (rc) + goto out_error; + dev->bus = &iucv_bus; + dev->parent = iucv_root; + dev->driver = driver; + dev->groups = attrs; + dev->release = iucv_release_device; + dev_set_drvdata(dev, priv); + return dev; + +out_error: + kfree(dev); + return NULL; +} +EXPORT_SYMBOL(iucv_alloc_device); static int iucv_available; @@ -210,7 +244,7 @@ struct iucv_cmd_dpl { u8 iprmmsg[8]; u32 ipsrccls; u32 ipmsgtag; - u32 ipbfadr2; + dma32_t ipbfadr2; u32 ipbfln2f; u32 res; } __attribute__ ((packed,aligned(8))); @@ -226,11 +260,11 @@ struct iucv_cmd_db { u8 iprcode; u32 ipmsgid; u32 iptrgcls; - u32 ipbfadr1; + dma32_t ipbfadr1; u32 ipbfln1f; u32 ipsrccls; u32 ipmsgtag; - u32 ipbfadr2; + dma32_t ipbfadr2; u32 ipbfln2f; u32 res; } __attribute__ ((packed,aligned(8))); @@ -286,6 +320,7 @@ static union iucv_param *iucv_param_irq[NR_CPUS]; */ static inline int __iucv_call_b2f0(int command, union iucv_param *parm) { + unsigned long reg1 = virt_to_phys(parm); int cc; asm volatile( @@ -296,7 +331,7 @@ static inline int __iucv_call_b2f0(int command, union iucv_param *parm) " srl %[cc],28\n" : [cc] "=&d" (cc), "+m" (*parm) : [reg0] "d" ((unsigned long)command), - [reg1] "d" ((unsigned long)parm) + [reg1] "d" (reg1) : "cc", "0", "1"); return cc; } @@ -431,7 +466,7 @@ static void iucv_declare_cpu(void *data) /* Declare interrupt buffer. */ parm = iucv_param_irq[cpu]; memset(parm, 0, sizeof(union iucv_param)); - parm->db.ipbfadr1 = virt_to_phys(iucv_irq_data[cpu]); + parm->db.ipbfadr1 = virt_to_dma32(iucv_irq_data[cpu]); rc = iucv_call_b2f0(IUCV_DECLARE_BUFFER, parm); if (rc) { char *err = "Unknown"; @@ -519,7 +554,7 @@ static void iucv_setmask_mp(void) */ static void iucv_setmask_up(void) { - cpumask_t cpumask; + static cpumask_t cpumask; int cpu; /* Disable all cpu but the first in cpu_irq_cpumask. */ @@ -627,23 +662,33 @@ static int iucv_cpu_online(unsigned int cpu) static int iucv_cpu_down_prep(unsigned int cpu) { - cpumask_t cpumask; + cpumask_var_t cpumask; + int ret = 0; if (!iucv_path_table) return 0; - cpumask_copy(&cpumask, &iucv_buffer_cpumask); - cpumask_clear_cpu(cpu, &cpumask); - if (cpumask_empty(&cpumask)) + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(cpumask, &iucv_buffer_cpumask); + cpumask_clear_cpu(cpu, cpumask); + if (cpumask_empty(cpumask)) { /* Can't offline last IUCV enabled cpu. */ - return -EINVAL; + ret = -EINVAL; + goto __free_cpumask; + } iucv_retrieve_cpu(NULL); if (!cpumask_empty(&iucv_irq_cpumask)) - return 0; + goto __free_cpumask; + smp_call_function_single(cpumask_first(&iucv_buffer_cpumask), iucv_allow_cpu, NULL, 1); - return 0; + +__free_cpumask: + free_cpumask_var(cpumask); + return ret; } /** @@ -1080,8 +1125,7 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path, size = (size < 8) ? size : 8; for (array = buffer; size > 0; array++) { copy = min_t(size_t, size, array->length); - memcpy((u8 *)(addr_t) array->address, - rmmsg, copy); + memcpy(dma32_to_virt(array->address), rmmsg, copy); rmmsg += copy; size -= copy; } @@ -1123,7 +1167,7 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, parm = iucv_param[smp_processor_id()]; memset(parm, 0, sizeof(union iucv_param)); - parm->db.ipbfadr1 = (u32)(addr_t) buffer; + parm->db.ipbfadr1 = virt_to_dma32(buffer); parm->db.ipbfln1f = (u32) size; parm->db.ipmsgid = msg->id; parm->db.ippathid = path->pathid; @@ -1241,7 +1285,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, parm->dpl.iptrgcls = msg->class; memcpy(parm->dpl.iprmmsg, reply, min_t(size_t, size, 8)); } else { - parm->db.ipbfadr1 = (u32)(addr_t) reply; + parm->db.ipbfadr1 = virt_to_dma32(reply); parm->db.ipbfln1f = (u32) size; parm->db.ippathid = path->pathid; parm->db.ipflags1 = flags; @@ -1293,7 +1337,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, parm->dpl.ipmsgtag = msg->tag; memcpy(parm->dpl.iprmmsg, buffer, 8); } else { - parm->db.ipbfadr1 = (u32)(addr_t) buffer; + parm->db.ipbfadr1 = virt_to_dma32(buffer); parm->db.ipbfln1f = (u32) size; parm->db.ippathid = path->pathid; parm->db.ipflags1 = flags | IUCV_IPNORPY; @@ -1378,7 +1422,7 @@ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, parm->dpl.iptrgcls = msg->class; parm->dpl.ipsrccls = srccls; parm->dpl.ipmsgtag = msg->tag; - parm->dpl.ipbfadr2 = (u32)(addr_t) answer; + parm->dpl.ipbfadr2 = virt_to_dma32(answer); parm->dpl.ipbfln2f = (u32) asize; memcpy(parm->dpl.iprmmsg, buffer, 8); } else { @@ -1387,9 +1431,9 @@ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, parm->db.iptrgcls = msg->class; parm->db.ipsrccls = srccls; parm->db.ipmsgtag = msg->tag; - parm->db.ipbfadr1 = (u32)(addr_t) buffer; + parm->db.ipbfadr1 = virt_to_dma32(buffer); parm->db.ipbfln1f = (u32) size; - parm->db.ipbfadr2 = (u32)(addr_t) answer; + parm->db.ipbfadr2 = virt_to_dma32(answer); parm->db.ipbfln2f = (u32) asize; } rc = iucv_call_b2f0(IUCV_SEND, parm); @@ -1903,6 +1947,6 @@ static void __exit iucv_exit(void) subsys_initcall(iucv_init); module_exit(iucv_exit); -MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert (felfert@millenux.com)"); +MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert <felfert@millenux.com>"); MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver"); MODULE_LICENSE("GPL"); diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index eda933c097..2f191e50d4 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -627,7 +627,8 @@ retry: skb = txm->frag_skb; } - if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { + if (WARN_ON(!skb_shinfo(skb)->nr_frags) || + WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) { ret = -EINVAL; goto out; } @@ -637,8 +638,8 @@ retry: msize += skb_frag_size(&skb_shinfo(skb)->frags[i]); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, - skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags, - msize); + (const struct bio_vec *)skb_shinfo(skb)->frags, + skb_shinfo(skb)->nr_frags, msize); iov_iter_advance(&msg.msg_iter, txm->frag_offset); do { @@ -1878,15 +1879,11 @@ static int __init kcm_init(void) { int err = -ENOMEM; - kcm_muxp = kmem_cache_create("kcm_mux_cache", - sizeof(struct kcm_mux), 0, - SLAB_HWCACHE_ALIGN, NULL); + kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN); if (!kcm_muxp) goto fail; - kcm_psockp = kmem_cache_create("kcm_psock_cache", - sizeof(struct kcm_psock), 0, - SLAB_HWCACHE_ALIGN, NULL); + kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN); if (!kcm_psockp) goto fail; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 8d21ff25f1..7ea4adf81d 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -88,6 +88,11 @@ /* Default trace flags */ #define L2TP_DEFAULT_DEBUG_FLAGS 0 +#define L2TP_DEPTH_NESTING 2 +#if L2TP_DEPTH_NESTING == SINGLE_DEPTH_NESTING +#error "L2TP requires its own lockdep subclass" +#endif + /* Private data stored for received packets in the skb. */ struct l2tp_skb_cb { @@ -794,6 +799,7 @@ static void l2tp_session_queue_purge(struct l2tp_session *session) static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) { struct l2tp_session *session = NULL; + struct l2tp_tunnel *orig_tunnel = tunnel; unsigned char *ptr, *optr; u16 hdrflags; u32 tunnel_id, session_id; @@ -819,13 +825,8 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Get L2TP header flags */ hdrflags = ntohs(*(__be16 *)ptr); - /* Check protocol version */ + /* Get protocol version */ version = hdrflags & L2TP_HDR_VER_MASK; - if (version != tunnel->version) { - pr_debug_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", - tunnel->name, version, tunnel->version); - goto invalid; - } /* Get length of L2TP packet */ length = skb->len; @@ -837,7 +838,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Skip flags */ ptr += 2; - if (tunnel->version == L2TP_HDR_VER_2) { + if (version == L2TP_HDR_VER_2) { /* If length is present, skip it */ if (hdrflags & L2TP_HDRFLAG_L) ptr += 2; @@ -845,6 +846,20 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Extract tunnel and session ID */ tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; + + if (tunnel_id != tunnel->tunnel_id) { + /* We are receiving trafic for another tunnel, probably + * because we have several tunnels between the same + * IP/port quadruple, look it up. + */ + struct l2tp_tunnel *alt_tunnel; + + alt_tunnel = l2tp_tunnel_get(tunnel->l2tp_net, tunnel_id); + if (!alt_tunnel) + goto pass; + tunnel = alt_tunnel; + } + session_id = ntohs(*(__be16 *)ptr); ptr += 2; } else { @@ -854,6 +869,13 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) ptr += 4; } + /* Check protocol version */ + if (version != tunnel->version) { + pr_debug_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n", + tunnel->name, version, tunnel->version); + goto invalid; + } + /* Find the session context */ session = l2tp_tunnel_get_session(tunnel, session_id); if (!session || !session->recv_skb) { @@ -875,6 +897,9 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); + if (tunnel != orig_tunnel) + l2tp_tunnel_dec_refcount(tunnel); + return 0; invalid: @@ -884,25 +909,26 @@ pass: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); + if (tunnel != orig_tunnel) + l2tp_tunnel_dec_refcount(tunnel); + return 1; } -/* UDP encapsulation receive handler. See net/ipv4/udp.c. - * Return codes: - * 0 : success. - * <0: error - * >0: skb should be passed up to userspace as UDP. +/* UDP encapsulation receive and error receive handlers. + * See net/ipv4/udp.c for details. + * + * Note that these functions are called from inside an + * RCU-protected region, but without the socket being locked. + * + * Hence we use rcu_dereference_sk_user_data to access the + * tunnel data structure rather the usual l2tp_sk_to_tunnel + * accessor function. */ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; - /* Note that this is called from the encap_rcv hook inside an - * RCU-protected region, but without the socket being locked. - * Hence we use rcu_dereference_sk_user_data to access the - * tunnel data structure rather the usual l2tp_sk_to_tunnel - * accessor function. - */ tunnel = rcu_dereference_sk_user_data(sk); if (!tunnel) goto pass_up; @@ -919,6 +945,29 @@ pass_up: } EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv); +static void l2tp_udp_encap_err_recv(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) +{ + struct l2tp_tunnel *tunnel; + + tunnel = rcu_dereference_sk_user_data(sk); + if (!tunnel || tunnel->fd < 0) + return; + + sk->sk_err = err; + sk_error_report(sk); + + if (ip_hdr(skb)->version == IPVERSION) { + if (inet_test_bit(RECVERR, sk)) + return ip_icmp_error(sk, skb, err, port, info, payload); +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (inet6_test_bit(RECVERR6, sk)) + return ipv6_icmp_error(sk, skb, err, port, info, payload); +#endif + } +} + /************************************************************************ * Transmit handling ***********************************************************************/ @@ -1041,7 +1090,13 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); nf_reset_ct(skb); - bh_lock_sock_nested(sk); + /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by + * nested socket calls on the same lockdep socket class. This can + * happen when data from a user socket is routed over l2tp, which uses + * another userspace socket. + */ + spin_lock_nested(&sk->sk_lock.slock, L2TP_DEPTH_NESTING); + if (sock_owned_by_user(sk)) { kfree_skb(skb); ret = NET_XMIT_DROP; @@ -1093,7 +1148,7 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl); out_unlock: - bh_unlock_sock(sk); + spin_unlock(&sk->sk_lock.slock); return ret; } @@ -1493,6 +1548,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, .sk_user_data = tunnel, .encap_type = UDP_ENCAP_L2TPINUDP, .encap_rcv = l2tp_udp_encap_recv, + .encap_err_rcv = l2tp_udp_encap_err_recv, .encap_destroy = l2tp_udp_encap_destroy, }; diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 25ca89f804..8ba00ad433 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -100,7 +100,7 @@ static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, }; -static struct device_type l2tpeth_type = { +static const struct device_type l2tpeth_type = { .name = "l2tpeth", }; @@ -127,6 +127,9 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, /* checksums verified by L2TP */ skb->ip_summed = CHECKSUM_NONE; + /* drop outer flow-hash */ + skb_clear_hash(skb); + skb_dst_drop(skb); nf_reset_ct(skb); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 9a2a9ed3ba..19c8cc5289 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -459,7 +459,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &inet->cork.fl.u.ip4; if (connected) - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { @@ -478,7 +478,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, - sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 7bf14cf9ff..8780ec64f3 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -630,7 +630,7 @@ back_from_confirm: ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, - &fl6, (struct rt6_info *)dst, + &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index fde1140d89..4eb52add71 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -688,14 +688,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. - * @flags: User specified operational flags. - * @kern: If the socket is kernel internal + * @arg: User specified arguments * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ -static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int llc_ui_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 8443a6d841..72e101135f 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -44,11 +44,6 @@ static struct ctl_table llc2_timeout_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, -}; - -static struct ctl_table llc_station_table[] = { - { }, }; static struct ctl_table_header *llc2_timeout_header; @@ -56,8 +51,9 @@ static struct ctl_table_header *llc_station_header; int __init llc_sysctl_init(void) { + struct ctl_table empty[1] = {}; llc2_timeout_header = register_net_sysctl(&init_net, "net/llc/llc2/timeout", llc2_timeout_table); - llc_station_header = register_net_sysctl(&init_net, "net/llc/station", llc_station_table); + llc_station_header = register_net_sysctl_sz(&init_net, "net/llc/station", empty, 0); if (!llc2_timeout_header || !llc_station_header) { llc_sysctl_exit(); diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 4406b4f8f3..a33884967f 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -29,7 +29,7 @@ mac80211-y := \ spectmgmt.o \ tx.o \ key.o \ - util.o \ + util.o parse.o \ wme.o \ chan.o \ trace.o mlme.o \ diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index b8a278355e..677bbbac9f 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -616,7 +616,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, return -EINVAL; if (!pubsta->deflink.ht_cap.ht_supported && - sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) + !pubsta->deflink.vht_cap.vht_supported && + !pubsta->deflink.he_cap.has_he && + !pubsta->deflink.eht_cap.has_eht) return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1d43a80064..87a7b569cc 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -114,7 +114,7 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* apply all changes now - no failures allowed */ - if (monitor_sdata) + if (monitor_sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { @@ -886,33 +886,32 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata; - int ret = 0; + struct ieee80211_chan_req chanreq = { .oper = *chandef }; + int ret; lockdep_assert_wiphy(local->hw.wiphy); - if (cfg80211_chandef_identical(&local->monitor_chandef, chandef)) + if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, + &chanreq.oper)) return 0; - if (local->use_chanctx) { - sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); - if (sdata) { - ieee80211_link_release_channel(&sdata->deflink); - ret = ieee80211_link_use_channel(&sdata->deflink, - chandef, - IEEE80211_CHANCTX_EXCLUSIVE); - } - } else { - if (local->open_count == local->monitors) { - local->_oper_chandef = *chandef; - ieee80211_hw_config(local, 0); - } - } + sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); + if (!sdata) + goto done; - if (ret == 0) - local->monitor_chandef = *chandef; + if (cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, + &chanreq.oper)) + return 0; - return ret; + ieee80211_link_release_channel(&sdata->deflink); + ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq, + IEEE80211_CHANCTX_EXCLUSIVE); + if (ret) + return ret; +done: + local->monitor_chanreq = chanreq; + return 0; } static int @@ -953,7 +952,8 @@ ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata, struct cfg80211_fils_discovery *params, struct ieee80211_link_data *link, - struct ieee80211_bss_conf *link_conf) + struct ieee80211_bss_conf *link_conf, + u64 *changed) { struct fils_discovery_data *new, *old = NULL; struct ieee80211_fils_discovery *fd; @@ -980,7 +980,8 @@ static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata, RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); } - return BSS_CHANGED_FILS_DISCOVERY; + *changed |= BSS_CHANGED_FILS_DISCOVERY; + return 0; } static int @@ -1240,6 +1241,30 @@ ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, return 0; } +static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_link_data *link; + u8 link_id, num = 0; + + if (sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_P2P_GO) + return num; + + if (!sdata->vif.valid_links) + return num; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + + if (sdata_dereference(link->u.ap.beacon, sdata)) + num++; + } + + return num; +} + static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ap_settings *params) { @@ -1258,6 +1283,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, unsigned int link_id = params->beacon.link_id; struct ieee80211_link_data *link; struct ieee80211_bss_conf *link_conf; + struct ieee80211_chan_req chanreq = { .oper = params->chandef }; lockdep_assert_wiphy(local->hw.wiphy); @@ -1341,8 +1367,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, return -EOPNOTSUPP; link_conf->eht_support = true; - link_conf->eht_puncturing = params->punct_bitmap; - changed |= BSS_CHANGED_EHT_PUNCTURING; link_conf->eht_su_beamformer = params->eht_cap->fixed.phy_cap_info[0] & @@ -1370,7 +1394,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, return err; } - err = ieee80211_link_use_channel(link, ¶ms->chandef, + err = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); if (!err) ieee80211_link_copy_chanctx_to_vlans(link, false); @@ -1445,10 +1469,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, goto error; err = ieee80211_set_fils_discovery(sdata, ¶ms->fils_discovery, - link, link_conf); + link, link_conf, &changed); if (err < 0) goto error; - changed |= err; err = ieee80211_set_unsol_bcast_probe_resp(sdata, ¶ms->unsol_bcast_probe_resp, @@ -1463,7 +1486,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(link->u.ap.beacon, NULL); - sdata->u.ap.active = false; + + if (ieee80211_num_beaconing_links(sdata) == 0) + sdata->u.ap.active = false; + goto error; } @@ -1471,7 +1497,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); ieee80211_link_info_change_notify(sdata, link, changed); - netif_carrier_on(dev); + if (ieee80211_num_beaconing_links(sdata) <= 1) + netif_carrier_on(dev); + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_on(vlan->dev); @@ -1519,10 +1547,9 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, return err; err = ieee80211_set_fils_discovery(sdata, ¶ms->fils_discovery, - link, link_conf); + link, link_conf, &changed); if (err < 0) return err; - changed |= err; err = ieee80211_set_unsol_bcast_probe_resp(sdata, ¶ms->unsol_bcast_probe_resp, @@ -1565,6 +1592,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); struct ieee80211_bss_conf *link_conf = link->conf; + LIST_HEAD(keys); lockdep_assert_wiphy(local->hw.wiphy); @@ -1582,10 +1610,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, /* abort any running channel switch or color change */ link_conf->csa_active = false; link_conf->color_change_active = false; - if (link->csa_block_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - link->csa_block_tx = false; + sdata->csa_blocked_queues = false; } ieee80211_free_next_beacon(link); @@ -1593,10 +1621,13 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); - netif_carrier_off(dev); + + if (ieee80211_num_beaconing_links(sdata) <= 1) { + netif_carrier_off(dev); + sdata->u.ap.active = false; + } /* remove beacon and probe response */ - sdata->u.ap.active = false; RCU_INIT_POINTER(link->u.ap.beacon, NULL); RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); @@ -1618,8 +1649,13 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->ema_ap = false; link_conf->bssid_indicator = 0; - __sta_info_flush(sdata, true); - ieee80211_free_keys(sdata, true); + __sta_info_flush(sdata, true, link_id); + + ieee80211_remove_link_keys(link, &keys); + if (!list_empty(&keys)) { + synchronize_net(); + ieee80211_free_key_list(local, &keys); + } link_conf->enable_beacon = false; sdata->beacon_rate_set = false; @@ -1629,7 +1665,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_BEACON_ENABLED); if (sdata->wdev.cac_started) { - chandef = link_conf->chandef; + chandef = link_conf->chanreq.oper; wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, @@ -1829,7 +1865,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, if (params->supported_rates && params->supported_rates_len) { - ieee80211_parse_bitrates(link->conf->chandef.width, + ieee80211_parse_bitrates(link->conf->chanreq.oper.width, sband, params->supported_rates, params->supported_rates_len, &link_sta->pub->supp_rates[sband->band]); @@ -1944,6 +1980,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, clear_sta_flag(sta, WLAN_STA_TDLS_PEER); } + if (mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) + sta->sta.spp_amsdu = set & BIT(NL80211_STA_FLAG_SPP_AMSDU); + /* mark TDLS channel switch support, if the AP allows it */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !sdata->deflink.u.mgd.tdls_chan_switch_prohibited && @@ -2095,7 +2134,7 @@ static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev, if (params->mac) return sta_info_destroy_addr_bss(sdata, params->mac); - sta_info_flush(sdata); + sta_info_flush(sdata, params->link_id); return 0; } @@ -2601,6 +2640,7 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, const struct mesh_setup *setup) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_chan_req chanreq = { .oper = setup->chandef }; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; int err; @@ -2617,7 +2657,7 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = sdata->local->rx_chains; - err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef, + err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; @@ -2660,7 +2700,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, return -EINVAL; if (params->basic_rates) { - if (!ieee80211_parse_bitrates(link->conf->chandef.width, + if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width, wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, @@ -2918,8 +2958,9 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - BSS_CHANGED_MCAST_RATE); + if (ieee80211_sdata_running(sdata)) + ieee80211_link_info_change_notify(sdata, &sdata->deflink, + BSS_CHANGED_MCAST_RATE); return 0; } @@ -2997,6 +3038,9 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return -EOPNOTSUPP; + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (!sdata) @@ -3059,7 +3103,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, if (has_monitor) { sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { sdata->deflink.user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) update_txp_type = true; @@ -3082,7 +3126,7 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, if (local->ops->get_txpower) return drv_get_txpower(local, sdata, dbm); - if (!local->use_chanctx) + if (local->emulate_chanctx) *dbm = local->hw.conf.power_level; else *dbm = sdata->vif.bss_conf.txpower; @@ -3152,8 +3196,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return -EINVAL; - if (ieee80211_vif_is_mld(&sdata->vif) && - !(sdata->vif.active_links & BIT(link->link_id))) + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; old_req = link->u.mgd.req_smps; @@ -3175,7 +3218,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, * the new value until we associate. */ if (!sdata->u.mgd.associated || - link->conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) + link->conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT) return 0; ap = sdata->vif.cfg.ap_addr; @@ -3206,7 +3249,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, if (err) link->u.mgd.req_smps = old_req; else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found) - ieee80211_teardown_tdls_peers(sdata); + ieee80211_teardown_tdls_peers(link); return err; } @@ -3253,33 +3296,57 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, return 0; } +static void ieee80211_set_cqm_rssi_link(struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_data *link, + s32 rssi_thold, u32 rssi_hyst, + s32 rssi_low, s32 rssi_high) +{ + struct ieee80211_bss_conf *conf; + + if (!link || !link->conf) + return; + + conf = link->conf; + + if (rssi_thold && rssi_hyst && + rssi_thold == conf->cqm_rssi_thold && + rssi_hyst == conf->cqm_rssi_hyst) + return; + + conf->cqm_rssi_thold = rssi_thold; + conf->cqm_rssi_hyst = rssi_hyst; + conf->cqm_rssi_low = rssi_low; + conf->cqm_rssi_high = rssi_high; + link->u.mgd.last_cqm_event_signal = 0; + + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) + return; + + if (sdata->u.mgd.associated && + (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_CQM); +} + static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; - struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; - - if (rssi_thold == bss_conf->cqm_rssi_thold && - rssi_hyst == bss_conf->cqm_rssi_hyst) - return 0; + int link_id; - if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && - !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER && + !(vif->driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) return -EOPNOTSUPP; - bss_conf->cqm_rssi_thold = rssi_thold; - bss_conf->cqm_rssi_hyst = rssi_hyst; - bss_conf->cqm_rssi_low = 0; - bss_conf->cqm_rssi_high = 0; - sdata->deflink.u.mgd.last_cqm_event_signal = 0; + /* For MLD, handle CQM change on all the active links */ + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct ieee80211_link_data *link = + sdata_dereference(sdata->link[link_id], sdata); - /* tell the driver upon association, unless already associated */ - if (sdata->u.mgd.associated && - sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI) - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - BSS_CHANGED_CQM); + ieee80211_set_cqm_rssi_link(sdata, link, rssi_thold, rssi_hyst, + 0, 0); + } return 0; } @@ -3290,22 +3357,19 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_vif *vif = &sdata->vif; - struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; + int link_id; - if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) + if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER) return -EOPNOTSUPP; - bss_conf->cqm_rssi_low = rssi_low; - bss_conf->cqm_rssi_high = rssi_high; - bss_conf->cqm_rssi_thold = 0; - bss_conf->cqm_rssi_hyst = 0; - sdata->deflink.u.mgd.last_cqm_event_signal = 0; + /* For MLD, handle CQM change on all the active links */ + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct ieee80211_link_data *link = + sdata_dereference(sdata->link[link_id], sdata); - /* tell the driver upon association, unless already associated */ - if (sdata->u.mgd.associated && - sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI) - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - BSS_CHANGED_CQM); + ieee80211_set_cqm_rssi_link(sdata, link, 0, 0, + rssi_low, rssi_high); + } return 0; } @@ -3330,9 +3394,11 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, * so at a basic rate so that all clients can receive it. */ if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) && - sdata->vif.bss_conf.chandef.chan) { + sdata->vif.bss_conf.chanreq.oper.chan) { u32 basic_rates = sdata->vif.bss_conf.basic_rates; - enum nl80211_band band = sdata->vif.bss_conf.chandef.chan->band; + enum nl80211_band band; + + band = sdata->vif.bss_conf.chanreq.oper.chan->band; if (!(mask->control[band].legacy & basic_rates)) return -EINVAL; @@ -3384,6 +3450,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, u32 cac_time_ms) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_chan_req chanreq = { .oper = *chandef }; struct ieee80211_local *local = sdata->local; int err; @@ -3398,7 +3465,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = local->rx_chains; - err = ieee80211_link_use_channel(&sdata->deflink, chandef, + err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) goto out_unlock; @@ -3541,13 +3608,24 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon) return new_beacon; } -void ieee80211_csa_finish(struct ieee80211_vif *vif) +void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; + struct ieee80211_link_data *link_data; + + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return; rcu_read_lock(); + link_data = rcu_dereference(sdata->link[link_id]); + if (WARN_ON(!link_data)) { + rcu_read_unlock(); + return; + } + + /* TODO: MBSSID with MLO changes */ if (vif->mbssid_tx_vif == vif) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on @@ -3566,7 +3644,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif) &iter->deflink.csa_finalize_work); } } - wiphy_work_queue(local->hw.wiphy, &sdata->deflink.csa_finalize_work); + wiphy_work_queue(local->hw.wiphy, &link_data->csa_finalize_work); rcu_read_unlock(); } @@ -3578,26 +3656,27 @@ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_t struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - sdata->deflink.csa_block_tx = block_tx; + sdata->csa_blocked_queues = block_tx; sdata_info(sdata, "channel switch failed, disconnecting\n"); wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work); } EXPORT_SYMBOL(ieee80211_channel_switch_disconnect); -static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, +static int ieee80211_set_after_csa_beacon(struct ieee80211_link_data *link_data, u64 *changed) { + struct ieee80211_sub_if_data *sdata = link_data->sdata; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: - if (!sdata->deflink.u.ap.next_beacon) + if (!link_data->u.ap.next_beacon) return -EINVAL; - err = ieee80211_assign_beacon(sdata, &sdata->deflink, - sdata->deflink.u.ap.next_beacon, + err = ieee80211_assign_beacon(sdata, link_data, + link_data->u.ap.next_beacon, NULL, NULL, changed); - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link_data); if (err < 0) return err; @@ -3626,6 +3705,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *link_conf = link_data->conf; u64 changed = 0; int err; @@ -3647,40 +3727,33 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) if (link_data->reserved_ready) return 0; - return ieee80211_link_use_reserved_context(&sdata->deflink); + return ieee80211_link_use_reserved_context(link_data); } - if (!cfg80211_chandef_identical(&link_data->conf->chandef, - &link_data->csa_chandef)) + if (!cfg80211_chandef_identical(&link_conf->chanreq.oper, + &link_data->csa_chanreq.oper)) return -EINVAL; - sdata->vif.bss_conf.csa_active = false; + link_conf->csa_active = false; - err = ieee80211_set_after_csa_beacon(sdata, &changed); + err = ieee80211_set_after_csa_beacon(link_data, &changed); if (err) return err; - if (sdata->vif.bss_conf.eht_puncturing != sdata->vif.bss_conf.csa_punct_bitmap) { - sdata->vif.bss_conf.eht_puncturing = - sdata->vif.bss_conf.csa_punct_bitmap; - changed |= BSS_CHANGED_EHT_PUNCTURING; - } - ieee80211_link_info_change_notify(sdata, link_data, changed); - if (link_data->csa_block_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - link_data->csa_block_tx = false; + sdata->csa_blocked_queues = false; } err = drv_post_channel_switch(link_data); if (err) return err; - cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chandef, - link_data->link_id, - link_data->conf->eht_puncturing); + cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chanreq.oper, + link_data->link_id); return 0; } @@ -3690,7 +3763,8 @@ static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data) struct ieee80211_sub_if_data *sdata = link_data->sdata; if (__ieee80211_csa_finalize(link_data)) { - sdata_info(sdata, "failed to finalize CSA, disconnecting\n"); + sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n", + link_data->link_id); cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, GFP_KERNEL); } @@ -3715,18 +3789,19 @@ void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) ieee80211_csa_finalize(link); } -static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, +static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data, struct cfg80211_csa_settings *params, u64 *changed) { + struct ieee80211_sub_if_data *sdata = link_data->sdata; struct ieee80211_csa_settings csa = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: - sdata->deflink.u.ap.next_beacon = + link_data->u.ap.next_beacon = cfg80211_beacon_dup(¶ms->beacon_after); - if (!sdata->deflink.u.ap.next_beacon) + if (!link_data->u.ap.next_beacon) return -ENOMEM; /* @@ -3752,7 +3827,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, IEEE80211_MAX_CNTDWN_COUNTERS_NUM) || (params->n_counter_offsets_presp > IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) { - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link_data); return -EINVAL; } @@ -3762,11 +3837,11 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, csa.n_counter_offsets_presp = params->n_counter_offsets_presp; csa.count = params->count; - err = ieee80211_assign_beacon(sdata, &sdata->deflink, + err = ieee80211_assign_beacon(sdata, link_data, ¶ms->beacon_csa, &csa, NULL, changed); if (err < 0) { - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link_data); return err; } @@ -3813,7 +3888,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; /* changes into another band are not supported */ - if (sdata->vif.bss_conf.chandef.chan->band != + if (sdata->vif.bss_conf.chanreq.oper.chan->band != params->chandef.chan->band) return -EINVAL; @@ -3847,13 +3922,13 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, return 0; } -static void ieee80211_color_change_abort(struct ieee80211_sub_if_data *sdata) +static void ieee80211_color_change_abort(struct ieee80211_link_data *link) { - sdata->vif.bss_conf.color_change_active = false; + link->conf->color_change_active = false; - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link); - cfg80211_color_change_aborted_notify(sdata->dev); + cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id); } static int @@ -3861,11 +3936,17 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_chan_req chanreq = { .oper = params->chandef }; struct ieee80211_local *local = sdata->local; - struct ieee80211_channel_switch ch_switch; + struct ieee80211_channel_switch ch_switch = { + .link_id = params->link_id, + }; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; + struct ieee80211_bss_conf *link_conf; + struct ieee80211_link_data *link_data; u64 changed = 0; + u8 link_id = params->link_id; int err; lockdep_assert_wiphy(local->hw.wiphy); @@ -3876,16 +3957,23 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, if (sdata->wdev.cac_started) return -EBUSY; - if (cfg80211_chandef_identical(¶ms->chandef, - &sdata->vif.bss_conf.chandef)) + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return -EINVAL; + + link_data = wiphy_dereference(wiphy, sdata->link[link_id]); + if (!link_data) + return -ENOLINK; + + link_conf = link_data->conf; + + if (chanreq.oper.punctured && !link_conf->eht_support) return -EINVAL; /* don't allow another channel switch if one is already active. */ - if (sdata->vif.bss_conf.csa_active) + if (link_conf->csa_active) return -EBUSY; - conf = rcu_dereference_protected(sdata->vif.bss_conf.chanctx_conf, - lockdep_is_held(&local->hw.wiphy->mtx)); + conf = wiphy_dereference(wiphy, link_conf->chanctx_conf); if (!conf) { err = -EBUSY; goto out; @@ -3902,14 +3990,14 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; ch_switch.block_tx = params->block_tx; - ch_switch.chandef = params->chandef; + ch_switch.chandef = chanreq.oper; ch_switch.count = params->count; err = drv_pre_channel_switch(sdata, &ch_switch); if (err) goto out; - err = ieee80211_link_reserve_chanctx(&sdata->deflink, ¶ms->chandef, + err = ieee80211_link_reserve_chanctx(link_data, &chanreq, chanctx->mode, params->radar_required); if (err) @@ -3918,44 +4006,40 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, /* if reservation is invalid then this will fail */ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0); if (err) { - ieee80211_link_unreserve_chanctx(&sdata->deflink); + ieee80211_link_unreserve_chanctx(link_data); goto out; } /* if there is a color change in progress, abort it */ - if (sdata->vif.bss_conf.color_change_active) - ieee80211_color_change_abort(sdata); + if (link_conf->color_change_active) + ieee80211_color_change_abort(link_data); - err = ieee80211_set_csa_beacon(sdata, params, &changed); + err = ieee80211_set_csa_beacon(link_data, params, &changed); if (err) { - ieee80211_link_unreserve_chanctx(&sdata->deflink); + ieee80211_link_unreserve_chanctx(link_data); goto out; } - if (params->punct_bitmap && !sdata->vif.bss_conf.eht_support) - goto out; + link_data->csa_chanreq = chanreq; + link_conf->csa_active = true; - sdata->deflink.csa_chandef = params->chandef; - sdata->deflink.csa_block_tx = params->block_tx; - sdata->vif.bss_conf.csa_active = true; - sdata->vif.bss_conf.csa_punct_bitmap = params->punct_bitmap; - - if (sdata->deflink.csa_block_tx) + if (params->block_tx && + !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) { ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_blocked_queues = true; + } cfg80211_ch_switch_started_notify(sdata->dev, - &sdata->deflink.csa_chandef, 0, - params->count, params->block_tx, - sdata->vif.bss_conf.csa_punct_bitmap); + &link_data->csa_chanreq.oper, link_id, + params->count, params->block_tx); if (changed) { - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - changed); - drv_channel_switch_beacon(sdata, ¶ms->chandef); + ieee80211_link_info_change_notify(sdata, link_data, changed); + drv_channel_switch_beacon(sdata, &link_data->csa_chanreq.oper); } else { /* if the beacon didn't change, we can finalize immediately */ - ieee80211_csa_finalize(&sdata->deflink); + ieee80211_csa_finalize(link_data); } out: @@ -4205,15 +4289,12 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy, chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) { - *chandef = link->conf->chandef; + *chandef = link->conf->chanreq.oper; ret = 0; } else if (local->open_count > 0 && local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { - if (local->use_chanctx) - *chandef = local->monitor_chandef; - else - *chandef = local->_oper_chandef; + *chandef = local->monitor_chanreq.oper; ret = 0; } out: @@ -4261,12 +4342,13 @@ static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_link_data *link; + struct ieee80211_chan_req chanreq = { .oper = *chandef }; int ret; u64 changed = 0; link = sdata_dereference(sdata->link[link_id], sdata); - ret = ieee80211_link_change_bandwidth(link, chandef, &changed); + ret = ieee80211_link_change_chanreq(link, &chanreq, &changed); if (ret == 0) ieee80211_link_info_change_notify(sdata, link, changed); @@ -4588,20 +4670,22 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy, } static int -ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, +ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link, u64 *changed) { + struct ieee80211_sub_if_data *sdata = link->sdata; + switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { int ret; - if (!sdata->deflink.u.ap.next_beacon) + if (!link->u.ap.next_beacon) return -EINVAL; - ret = ieee80211_assign_beacon(sdata, &sdata->deflink, - sdata->deflink.u.ap.next_beacon, + ret = ieee80211_assign_beacon(sdata, link, + link->u.ap.next_beacon, NULL, NULL, changed); - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link); if (ret < 0) return ret; @@ -4617,18 +4701,19 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, } static int -ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, +ieee80211_set_color_change_beacon(struct ieee80211_link_data *link, struct cfg80211_color_change_settings *params, u64 *changed) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_color_change_settings color_change = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: - sdata->deflink.u.ap.next_beacon = + link->u.ap.next_beacon = cfg80211_beacon_dup(¶ms->beacon_next); - if (!sdata->deflink.u.ap.next_beacon) + if (!link->u.ap.next_beacon) return -ENOMEM; if (params->count <= 1) @@ -4640,11 +4725,11 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, params->counter_offset_presp; color_change.count = params->count; - err = ieee80211_assign_beacon(sdata, &sdata->deflink, + err = ieee80211_assign_beacon(sdata, link, ¶ms->beacon_color_change, NULL, &color_change, changed); if (err < 0) { - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link); return err; } break; @@ -4656,16 +4741,18 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, } static void -ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, +ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, u8 color, int enable, u64 changed) { + struct ieee80211_sub_if_data *sdata = link->sdata; + lockdep_assert_wiphy(sdata->local->hw.wiphy); - sdata->vif.bss_conf.he_bss_color.color = color; - sdata->vif.bss_conf.he_bss_color.enabled = enable; + link->conf->he_bss_color.color = color; + link->conf->he_bss_color.enabled = enable; changed |= BSS_CHANGED_HE_BSS_COLOR; - ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); + ieee80211_link_info_change_notify(sdata, link, changed); if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { struct ieee80211_sub_if_data *child; @@ -4682,26 +4769,27 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, } } -static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) +static int ieee80211_color_change_finalize(struct ieee80211_link_data *link) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); - sdata->vif.bss_conf.color_change_active = false; + link->conf->color_change_active = false; - err = ieee80211_set_after_color_change_beacon(sdata, &changed); + err = ieee80211_set_after_color_change_beacon(link, &changed); if (err) { - cfg80211_color_change_aborted_notify(sdata->dev); + cfg80211_color_change_aborted_notify(sdata->dev, link->link_id); return err; } - ieee80211_color_change_bss_config_notify(sdata, - sdata->vif.bss_conf.color_change_color, + ieee80211_color_change_bss_config_notify(link, + link->conf->color_change_color, 1, changed); - cfg80211_color_change_notify(sdata->dev); + cfg80211_color_change_notify(sdata->dev, link->link_id); return 0; } @@ -4709,21 +4797,23 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { - struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, - deflink.color_change_finalize_work); + struct ieee80211_link_data *link = + container_of(work, struct ieee80211_link_data, + color_change_finalize_work); + struct ieee80211_sub_if_data *sdata = link->sdata; + struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. */ - if (!sdata->vif.bss_conf.color_change_active) + if (!link_conf->color_change_active) return; if (!ieee80211_sdata_running(sdata)) return; - ieee80211_color_change_finalize(sdata); + ieee80211_color_change_finalize(link); } void ieee80211_color_collision_detection_work(struct work_struct *work) @@ -4734,30 +4824,60 @@ void ieee80211_color_collision_detection_work(struct work_struct *work) color_collision_detect_work); struct ieee80211_sub_if_data *sdata = link->sdata; - cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap); + cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, + link->link_id); } -void ieee80211_color_change_finish(struct ieee80211_vif *vif) +void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link; + + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return; + + rcu_read_lock(); + + link = rcu_dereference(sdata->link[link_id]); + if (WARN_ON(!link)) { + rcu_read_unlock(); + return; + } wiphy_work_queue(sdata->local->hw.wiphy, - &sdata->deflink.color_change_finalize_work); + &link->color_change_finalize_work); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, - u64 color_bitmap, gfp_t gfp) + u64 color_bitmap, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_link_data *link = &sdata->deflink; + struct ieee80211_link_data *link; + + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return; + + rcu_read_lock(); + + link = rcu_dereference(sdata->link[link_id]); + if (WARN_ON(!link)) { + rcu_read_unlock(); + return; + } - if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) + if (link->conf->color_change_active || link->conf->csa_active) { + rcu_read_unlock(); return; + } - if (delayed_work_pending(&link->color_collision_detect_work)) + if (delayed_work_pending(&link->color_collision_detect_work)) { + rcu_read_unlock(); return; + } link->color_bitmap = color_bitmap; /* queue the color collision detection event every 500 ms in order to @@ -4766,6 +4886,8 @@ ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, ieee80211_queue_delayed_work(&sdata->local->hw, &link->color_collision_detect_work, msecs_to_jiffies(500)); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); @@ -4775,36 +4897,48 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *link_conf; + struct ieee80211_link_data *link; + u8 link_id = params->link_id; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); - if (sdata->vif.bss_conf.nontransmitted) + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return -EINVAL; + + link = wiphy_dereference(wiphy, sdata->link[link_id]); + if (!link) + return -ENOLINK; + + link_conf = link->conf; + + if (link_conf->nontransmitted) return -EINVAL; /* don't allow another color change if one is already active or if csa * is active */ - if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) { + if (link_conf->color_change_active || link_conf->csa_active) { err = -EBUSY; goto out; } - err = ieee80211_set_color_change_beacon(sdata, params, &changed); + err = ieee80211_set_color_change_beacon(link, params, &changed); if (err) goto out; - sdata->vif.bss_conf.color_change_active = true; - sdata->vif.bss_conf.color_change_color = params->color; + link_conf->color_change_active = true; + link_conf->color_change_color = params->color; - cfg80211_color_change_started_notify(sdata->dev, params->count); + cfg80211_color_change_started_notify(sdata->dev, params->count, link_id); if (changed) - ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed); + ieee80211_color_change_bss_config_notify(link, 0, 0, changed); else /* if the beacon didn't change, we can finalize immediately */ - ieee80211_color_change_finalize(sdata); + ieee80211_color_change_finalize(link); out: @@ -4967,6 +5101,17 @@ static int ieee80211_set_hw_timestamp(struct wiphy *wiphy, return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts); } +static int +ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ttlm_params *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + return ieee80211_req_neg_ttlm(sdata, params); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -5079,4 +5224,5 @@ const struct cfg80211_ops mac80211_config_ops = { .mod_link_station = ieee80211_mod_link_station, .del_link_station = ieee80211_del_link_station, .set_hw_timestamp = ieee80211_set_hw_timestamp, + .set_ttlm = ieee80211_set_ttlm, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index ef4c2cebc0..e6a7ff6ca6 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 - channel management - * Copyright 2020 - 2022 Intel Corporation + * Copyright 2020 - 2024 Intel Corporation */ #include <linux/nl80211.h> @@ -81,87 +81,122 @@ ieee80211_link_get_chanctx(struct ieee80211_link_data *link) return container_of(conf, struct ieee80211_chanctx, conf); } -static const struct cfg80211_chan_def * -ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local, +bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a, + const struct ieee80211_chan_req *b) +{ + if (!cfg80211_chandef_identical(&a->oper, &b->oper)) + return false; + if (!a->ap.chan && !b->ap.chan) + return true; + return cfg80211_chandef_identical(&a->ap, &b->ap); +} + +static const struct ieee80211_chan_req * +ieee80211_chanreq_compatible(const struct ieee80211_chan_req *a, + const struct ieee80211_chan_req *b, + struct ieee80211_chan_req *tmp) +{ + const struct cfg80211_chan_def *compat; + + if (a->ap.chan && b->ap.chan && + !cfg80211_chandef_identical(&a->ap, &b->ap)) + return NULL; + + compat = cfg80211_chandef_compatible(&a->oper, &b->oper); + if (!compat) + return NULL; + + /* Note: later code assumes this always fills & returns tmp if compat */ + tmp->oper = *compat; + tmp->ap = a->ap.chan ? a->ap : b->ap; + return tmp; +} + +static const struct ieee80211_chan_req * +ieee80211_chanctx_compatible(struct ieee80211_chanctx *ctx, + const struct ieee80211_chan_req *req, + struct ieee80211_chan_req *tmp) +{ + const struct ieee80211_chan_req *ret; + struct ieee80211_chan_req tmp2; + + *tmp = (struct ieee80211_chan_req){ + .oper = ctx->conf.def, + .ap = ctx->conf.ap, + }; + + ret = ieee80211_chanreq_compatible(tmp, req, &tmp2); + if (!ret) + return NULL; + *tmp = *ret; + return tmp; +} + +static const struct ieee80211_chan_req * +ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, - const struct cfg80211_chan_def *compat) + const struct ieee80211_chan_req *req, + struct ieee80211_chan_req *tmp) { struct ieee80211_link_data *link; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (!compat) - compat = &link->reserved_chandef; + if (WARN_ON(!req)) + return NULL; - compat = cfg80211_chandef_compatible(&link->reserved_chandef, - compat); - if (!compat) + list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { + req = ieee80211_chanreq_compatible(&link->reserved, req, tmp); + if (!req) break; } - return compat; + return req; } -static const struct cfg80211_chan_def * +static const struct ieee80211_chan_req * ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, - const struct cfg80211_chan_def *compat) + const struct ieee80211_chan_req *compat, + struct ieee80211_chan_req *tmp) { struct ieee80211_link_data *link; + const struct ieee80211_chan_req *comp_def = compat; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(link, &ctx->assigned_links, - assigned_chanctx_list) { + list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { struct ieee80211_bss_conf *link_conf = link->conf; if (link->reserved_chanctx) continue; - if (!compat) - compat = &link_conf->chandef; - - compat = cfg80211_chandef_compatible( - &link_conf->chandef, compat); - if (!compat) + comp_def = ieee80211_chanreq_compatible(&link_conf->chanreq, + comp_def, tmp); + if (!comp_def) break; } - return compat; -} - -static const struct cfg80211_chan_def * -ieee80211_chanctx_combined_chandef(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - const struct cfg80211_chan_def *compat) -{ - lockdep_assert_wiphy(local->hw.wiphy); - - compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat); - if (!compat) - return NULL; - - compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat); - if (!compat) - return NULL; - - return compat; + return comp_def; } static bool -ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - const struct cfg80211_chan_def *def) +ieee80211_chanctx_can_reserve(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + const struct ieee80211_chan_req *req) { + struct ieee80211_chan_req tmp; + lockdep_assert_wiphy(local->hw.wiphy); - if (ieee80211_chanctx_combined_chandef(local, ctx, def)) - return true; + if (!ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp)) + return false; + + if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req, &tmp)) + return false; if (!list_empty(&ctx->reserved_links) && - ieee80211_chanctx_reserved_chandef(local, ctx, def)) + ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp)) return true; return false; @@ -169,7 +204,7 @@ ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local, static struct ieee80211_chanctx * ieee80211_find_reservation_chanctx(struct ieee80211_local *local, - const struct cfg80211_chan_def *chandef, + const struct ieee80211_chan_req *chanreq, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; @@ -186,8 +221,7 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local, if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; - if (!ieee80211_chanctx_can_reserve_chandef(local, ctx, - chandef)) + if (!ieee80211_chanctx_can_reserve(local, ctx, chanreq)) continue; return ctx; @@ -202,7 +236,7 @@ static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta, enum ieee80211_sta_rx_bandwidth width; struct link_sta_info *link_sta; - link_sta = rcu_dereference(sta->link[link_id]); + link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); /* no effect if this STA has no presence on this link */ if (!link_sta) @@ -240,9 +274,10 @@ static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta, } static enum nl80211_chan_width -ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata, - unsigned int link_id) +ieee80211_get_max_required_bw(struct ieee80211_link_data *link) { + struct ieee80211_sub_if_data *sdata = link->sdata; + unsigned int link_id = link->link_id; enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct sta_info *sta; @@ -258,31 +293,25 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata, } static enum nl80211_chan_width -ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for) +ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for) { + struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; - struct ieee80211_vif *vif = &sdata->vif; - int link_id; - rcu_read_lock(); - for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { + for_each_sdata_link(local, link) { enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT; - struct ieee80211_link_data *link = - rcu_dereference(sdata->link[link_id]); - - if (!link) - continue; if (link != rsvd_for && rcu_access_pointer(link->conf->chanctx_conf) != &ctx->conf) continue; - switch (vif->type) { + switch (link->sdata->vif.type) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: - width = ieee80211_get_max_required_bw(sdata, link_id); + width = ieee80211_get_max_required_bw(link); break; case NL80211_IFTYPE_STATION: /* @@ -290,8 +319,8 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, * point, so take the width from the chandef, but * account also for TDLS peers */ - width = max(link->conf->chandef.width, - ieee80211_get_max_required_bw(sdata, link_id)); + width = max(link->conf->chanreq.oper.width, + ieee80211_get_max_required_bw(link)); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: @@ -299,7 +328,7 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: - width = link->conf->chandef.width; + width = link->conf->chanreq.oper.width; break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: @@ -312,40 +341,13 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, max_bw = max(max_bw, width); } - rcu_read_unlock(); - - return max_bw; -} - -static enum nl80211_chan_width -ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for) -{ - struct ieee80211_sub_if_data *sdata; - enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - enum nl80211_chan_width width; - - if (!ieee80211_sdata_running(sdata)) - continue; - - width = ieee80211_get_chanctx_vif_max_required_bw(sdata, ctx, - rsvd_for); - - max_bw = max(max_bw, width); - } /* use the configured bandwidth in case of monitor interface */ - sdata = rcu_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &ctx->conf) max_bw = max(max_bw, ctx->conf.def.width); - rcu_read_unlock(); - return max_bw; } @@ -382,7 +384,7 @@ _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, /* downgrade chandef up to max_bw */ min_def = ctx->conf.def; while (min_def.width > max_bw) - ieee80211_chandef_downgrade(&min_def); + ieee80211_chandef_downgrade(&min_def, NULL); if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def)) return 0; @@ -395,7 +397,7 @@ _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, } /* calling this function is assuming that station vif is updated to - * lates changes by calling ieee80211_link_update_chandef + * lates changes by calling ieee80211_link_update_chanreq */ static void ieee80211_chan_bw_change(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, @@ -475,10 +477,15 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, static void _ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, - const struct cfg80211_chan_def *chandef, + const struct ieee80211_chan_req *chanreq, struct ieee80211_link_data *rsvd_for) { - u32 changed; + const struct cfg80211_chan_def *chandef = &chanreq->oper; + struct ieee80211_chan_req ctx_req = { + .oper = ctx->conf.def, + .ap = ctx->conf.ap, + }; + u32 changed = 0; /* expected to handle only 20/40/80/160/320 channel widths */ switch (chandef->width) { @@ -500,47 +507,54 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, */ ieee80211_chan_bw_change(local, old_ctx, true); - if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { + if (ieee80211_chanreq_identical(&ctx_req, chanreq)) { ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); return; } - WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + WARN_ON(ieee80211_chanctx_refcount(local, ctx) > 1 && + !cfg80211_chandef_compatible(&ctx->conf.def, &chanreq->oper)); ieee80211_remove_wbrf(local, &ctx->conf.def); + if (!cfg80211_chandef_identical(&ctx->conf.def, &chanreq->oper)) { + if (ctx->conf.def.width != chanreq->oper.width) + changed |= IEEE80211_CHANCTX_CHANGE_WIDTH; + if (ctx->conf.def.punctured != chanreq->oper.punctured) + changed |= IEEE80211_CHANCTX_CHANGE_PUNCTURING; + } + if (!cfg80211_chandef_identical(&ctx->conf.ap, &chanreq->ap)) + changed |= IEEE80211_CHANCTX_CHANGE_AP; ctx->conf.def = *chandef; + ctx->conf.ap = chanreq->ap; /* check if min chanctx also changed */ - changed = IEEE80211_CHANCTX_CHANGE_WIDTH | - _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); + changed |= _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); ieee80211_add_wbrf(local, &ctx->conf.def); drv_change_chanctx(local, ctx, changed); - if (!local->use_chanctx) { - local->_oper_chandef = *chandef; - ieee80211_hw_config(local, 0); - } - - /* check is BW wider */ + /* check if BW is wider */ ieee80211_chan_bw_change(local, old_ctx, false); } static void ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, - const struct cfg80211_chan_def *chandef) + const struct ieee80211_chan_req *chanreq) { - _ieee80211_change_chanctx(local, ctx, old_ctx, chandef, NULL); + _ieee80211_change_chanctx(local, ctx, old_ctx, chanreq, NULL); } +/* Note: if successful, the returned chanctx is reserved for the link */ static struct ieee80211_chanctx * ieee80211_find_chanctx(struct ieee80211_local *local, - const struct cfg80211_chan_def *chandef, + struct ieee80211_link_data *link, + const struct ieee80211_chan_req *chanreq, enum ieee80211_chanctx_mode mode) { + struct ieee80211_chan_req tmp; struct ieee80211_chanctx *ctx; lockdep_assert_wiphy(local->hw.wiphy); @@ -548,8 +562,11 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (mode == IEEE80211_CHANCTX_EXCLUSIVE) return NULL; + if (WARN_ON(link->reserved_chanctx)) + return NULL; + list_for_each_entry(ctx, &local->chanctx_list, list) { - const struct cfg80211_chan_def *compat; + const struct ieee80211_chan_req *compat; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE) continue; @@ -557,15 +574,25 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) continue; - compat = cfg80211_chandef_compatible(&ctx->conf.def, chandef); + compat = ieee80211_chanctx_compatible(ctx, chanreq, &tmp); if (!compat) continue; - compat = ieee80211_chanctx_reserved_chandef(local, ctx, - compat); + compat = ieee80211_chanctx_reserved_chanreq(local, ctx, + compat, &tmp); if (!compat) continue; + /* + * Reserve the chanctx temporarily, as the driver might change + * active links during callbacks we make into it below and/or + * later during assignment, which could (otherwise) cause the + * context to actually be removed. + */ + link->reserved_chanctx = ctx; + list_add(&link->reserved_chanctx_list, + &ctx->reserved_links); + ieee80211_change_chanctx(local, ctx, ctx, compat); return ctx; @@ -576,26 +603,14 @@ ieee80211_find_chanctx(struct ieee80211_local *local, bool ieee80211_is_radar_required(struct ieee80211_local *local) { - struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; lockdep_assert_wiphy(local->hw.wiphy); - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - unsigned int link_id; - - for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { - struct ieee80211_link_data *link; - - link = rcu_dereference(sdata->link[link_id]); - - if (link && link->radar_required) { - rcu_read_unlock(); - return true; - } - } + for_each_sdata_link(local, link) { + if (link->radar_required) + return true; } - rcu_read_unlock(); return false; } @@ -605,43 +620,24 @@ ieee80211_chanctx_radar_required(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; - struct ieee80211_sub_if_data *sdata; - bool required = false; + struct ieee80211_link_data *link; lockdep_assert_wiphy(local->hw.wiphy); - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - unsigned int link_id; - - if (!ieee80211_sdata_running(sdata)) + for_each_sdata_link(local, link) { + if (rcu_access_pointer(link->conf->chanctx_conf) != conf) continue; - for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { - struct ieee80211_link_data *link; - - link = rcu_dereference(sdata->link[link_id]); - if (!link) - continue; - - if (rcu_access_pointer(link->conf->chanctx_conf) != conf) - continue; - if (!link->radar_required) - continue; - required = true; - break; - } - - if (required) - break; + if (!link->radar_required) + continue; + return true; } - rcu_read_unlock(); - return required; + return false; } static struct ieee80211_chanctx * ieee80211_alloc_chanctx(struct ieee80211_local *local, - const struct cfg80211_chan_def *chandef, + const struct ieee80211_chan_req *chanreq, enum ieee80211_chanctx_mode mode) { struct ieee80211_chanctx *ctx; @@ -654,7 +650,8 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, INIT_LIST_HEAD(&ctx->assigned_links); INIT_LIST_HEAD(&ctx->reserved_links); - ctx->conf.def = *chandef; + ctx->conf.def = chanreq->oper; + ctx->conf.ap = chanreq->ap; ctx->conf.rx_chains_static = 1; ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; @@ -674,23 +671,15 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, ieee80211_add_wbrf(local, &ctx->conf.def); - if (!local->use_chanctx) - local->hw.conf.radar_enabled = ctx->conf.radar_enabled; - /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) ieee80211_hw_config(local, changed); - if (!local->use_chanctx) { - local->_oper_chandef = ctx->conf.def; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); - } else { - err = drv_add_chanctx(local, ctx); - if (err) { - ieee80211_recalc_idle(local); - return err; - } + err = drv_add_chanctx(local, ctx); + if (err) { + ieee80211_recalc_idle(local); + return err; } return 0; @@ -698,74 +687,55 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, static struct ieee80211_chanctx * ieee80211_new_chanctx(struct ieee80211_local *local, - const struct cfg80211_chan_def *chandef, - enum ieee80211_chanctx_mode mode) + const struct ieee80211_chan_req *chanreq, + enum ieee80211_chanctx_mode mode, + bool assign_on_failure) { struct ieee80211_chanctx *ctx; int err; lockdep_assert_wiphy(local->hw.wiphy); - ctx = ieee80211_alloc_chanctx(local, chandef, mode); + ctx = ieee80211_alloc_chanctx(local, chanreq, mode); if (!ctx) return ERR_PTR(-ENOMEM); err = ieee80211_add_chanctx(local, ctx); - if (err) { + if (!assign_on_failure && err) { kfree(ctx); return ERR_PTR(err); } + /* We ignored a driver error, see _ieee80211_set_active_links */ + WARN_ON_ONCE(err && !local->in_reconfig); list_add_rcu(&ctx->list, &local->chanctx_list); return ctx; } static void ieee80211_del_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + bool skip_idle_recalc) { lockdep_assert_wiphy(local->hw.wiphy); - if (!local->use_chanctx) { - struct cfg80211_chan_def *chandef = &local->_oper_chandef; - /* S1G doesn't have 20MHz, so get the correct width for the - * current channel. - */ - if (chandef->chan->band == NL80211_BAND_S1GHZ) - chandef->width = - ieee80211_s1g_channel_width(chandef->chan); - else - chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = chandef->chan->center_freq; - chandef->freq1_offset = chandef->chan->freq_offset; - chandef->center_freq2 = 0; - - /* NOTE: Disabling radar is only valid here for - * single channel context. To be sure, check it ... - */ - WARN_ON(local->hw.conf.radar_enabled && - !list_empty(&local->chanctx_list)); - - local->hw.conf.radar_enabled = false; - - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); - } else { - drv_remove_chanctx(local, ctx); - } + drv_remove_chanctx(local, ctx); - ieee80211_recalc_idle(local); + if (!skip_idle_recalc) + ieee80211_recalc_idle(local); ieee80211_remove_wbrf(local, &ctx->conf.def); } static void ieee80211_free_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + bool skip_idle_recalc) { lockdep_assert_wiphy(local->hw.wiphy); WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0); list_del_rcu(&ctx->list); - ieee80211_del_chanctx(local, ctx); + ieee80211_del_chanctx(local, ctx, skip_idle_recalc); kfree_rcu(ctx, rcu_head); } @@ -773,64 +743,64 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; - struct ieee80211_sub_if_data *sdata; - const struct cfg80211_chan_def *compat = NULL; + const struct ieee80211_chan_req *compat = NULL; + struct ieee80211_link_data *link; + struct ieee80211_chan_req tmp; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - int link_id; - - if (!ieee80211_sdata_running(sdata)) - continue; + for_each_sdata_link(local, link) { + struct ieee80211_bss_conf *link_conf; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + if (link->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) continue; - for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { - struct ieee80211_bss_conf *link_conf = - rcu_dereference(sdata->vif.link_conf[link_id]); - - if (!link_conf) - continue; + link_conf = link->conf; - if (rcu_access_pointer(link_conf->chanctx_conf) != conf) - continue; + if (rcu_access_pointer(link_conf->chanctx_conf) != conf) + continue; - if (!compat) - compat = &link_conf->chandef; + if (!compat) + compat = &link_conf->chanreq; - compat = cfg80211_chandef_compatible(&link_conf->chandef, - compat); - if (WARN_ON_ONCE(!compat)) - break; - } + compat = ieee80211_chanreq_compatible(&link_conf->chanreq, + compat, &tmp); + if (WARN_ON_ONCE(!compat)) + return; } - if (WARN_ON_ONCE(!compat)) { - rcu_read_unlock(); + if (WARN_ON_ONCE(!compat)) return; - } /* TDLS peers can sometimes affect the chandef width */ - list_for_each_entry_rcu(sta, &local->sta_list, list) { + list_for_each_entry(sta, &local->sta_list, list) { + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_chan_req tdls_chanreq = {}; + int tdls_link_id; + if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || !sta->tdls_chandef.chan) continue; - compat = cfg80211_chandef_compatible(&sta->tdls_chandef, - compat); + tdls_link_id = ieee80211_tdls_sta_link_id(sta); + link = sdata_dereference(sdata->link[tdls_link_id], sdata); + if (!link) + continue; + + if (rcu_access_pointer(link->conf->chanctx_conf) != conf) + continue; + + tdls_chanreq.oper = sta->tdls_chandef; + + /* note this always fills and returns &tmp if compat */ + compat = ieee80211_chanreq_compatible(&tdls_chanreq, + compat, &tmp); if (WARN_ON_ONCE(!compat)) - break; + return; } - rcu_read_unlock(); - - if (!compat) - return; ieee80211_change_chanctx(local, ctx, ctx, compat); } @@ -849,22 +819,19 @@ static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, chanctx->conf.radar_enabled = radar_enabled; - if (!local->use_chanctx) { - local->hw.conf.radar_enabled = chanctx->conf.radar_enabled; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); - } - drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR); } static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, - struct ieee80211_chanctx *new_ctx) + struct ieee80211_chanctx *new_ctx, + bool assign_on_failure) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *curr_ctx = NULL; - int ret = 0; + bool new_idle; + int ret; if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) return -EOPNOTSUPP; @@ -885,19 +852,22 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, ieee80211_recalc_chanctx_min_def(local, new_ctx, link); ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); - if (ret) - goto out; - - conf = &new_ctx->conf; - list_add(&link->assigned_chanctx_list, - &new_ctx->assigned_links); + if (assign_on_failure || !ret) { + /* Need to continue, see _ieee80211_set_active_links */ + WARN_ON_ONCE(ret && !local->in_reconfig); + ret = 0; + + /* succeeded, so commit it to the data structures */ + conf = &new_ctx->conf; + list_add(&link->assigned_chanctx_list, + &new_ctx->assigned_links); + } + } else { + ret = 0; } -out: rcu_assign_pointer(link->conf->chanctx_conf, conf); - sdata->vif.cfg.idle = !conf; - if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) { ieee80211_recalc_chanctx_chantype(local, curr_ctx); ieee80211_recalc_smps_chanctx(local, curr_ctx); @@ -910,9 +880,27 @@ out: ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); } - if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && - sdata->vif.type != NL80211_IFTYPE_MONITOR) - ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_IDLE); + if (conf) { + new_idle = false; + } else { + struct ieee80211_link_data *tmp; + + new_idle = true; + for_each_sdata_link(local, tmp) { + if (rcu_access_pointer(tmp->conf->chanctx_conf)) { + new_idle = false; + break; + } + } + } + + if (new_idle != sdata->vif.cfg.idle) { + sdata->vif.cfg.idle = new_idle; + + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_MONITOR) + ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_IDLE); + } ieee80211_check_fast_xmit_iface(sdata); @@ -924,23 +912,19 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, { struct ieee80211_sub_if_data *sdata; u8 rx_chains_static, rx_chains_dynamic; + struct ieee80211_link_data *link; lockdep_assert_wiphy(local->hw.wiphy); rx_chains_static = 1; rx_chains_dynamic = 1; - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { + for_each_sdata_link(local, link) { u8 needed_static, needed_dynamic; - unsigned int link_id; - - if (!ieee80211_sdata_running(sdata)) - continue; - switch (sdata->vif.type) { + switch (link->sdata->vif.type) { case NL80211_IFTYPE_STATION: - if (!sdata->u.mgd.associated) + if (!link->sdata->u.mgd.associated) continue; break; case NL80211_IFTYPE_AP: @@ -952,59 +936,38 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, continue; } - for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { - struct ieee80211_link_data *link; - - link = rcu_dereference(sdata->link[link_id]); - - if (!link) - continue; - - if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) - continue; - - switch (link->smps_mode) { - default: - WARN_ONCE(1, "Invalid SMPS mode %d\n", - link->smps_mode); - fallthrough; - case IEEE80211_SMPS_OFF: - needed_static = link->needed_rx_chains; - needed_dynamic = link->needed_rx_chains; - break; - case IEEE80211_SMPS_DYNAMIC: - needed_static = 1; - needed_dynamic = link->needed_rx_chains; - break; - case IEEE80211_SMPS_STATIC: - needed_static = 1; - needed_dynamic = 1; - break; - } + if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) + continue; - rx_chains_static = max(rx_chains_static, needed_static); - rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic); + switch (link->smps_mode) { + default: + WARN_ONCE(1, "Invalid SMPS mode %d\n", + link->smps_mode); + fallthrough; + case IEEE80211_SMPS_OFF: + needed_static = link->needed_rx_chains; + needed_dynamic = link->needed_rx_chains; + break; + case IEEE80211_SMPS_DYNAMIC: + needed_static = 1; + needed_dynamic = link->needed_rx_chains; + break; + case IEEE80211_SMPS_STATIC: + needed_static = 1; + needed_dynamic = 1; + break; } + + rx_chains_static = max(rx_chains_static, needed_static); + rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic); } /* Disable SMPS for the monitor interface */ - sdata = rcu_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata && rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf) rx_chains_dynamic = rx_chains_static = local->rx_chains; - rcu_read_unlock(); - - if (!local->use_chanctx) { - if (rx_chains_static > 1) - local->smps_mode = IEEE80211_SMPS_OFF; - else if (rx_chains_dynamic > 1) - local->smps_mode = IEEE80211_SMPS_DYNAMIC; - else - local->smps_mode = IEEE80211_SMPS_STATIC; - ieee80211_hw_config(local, 0); - } - if (rx_chains_static == chanctx->conf.rx_chains_static && rx_chains_dynamic == chanctx->conf.rx_chains_dynamic) return; @@ -1043,17 +1006,16 @@ __ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, if (clear) conf = NULL; - rcu_read_lock(); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { struct ieee80211_bss_conf *vlan_conf; - vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]); + vlan_conf = wiphy_dereference(local->hw.wiphy, + vlan->vif.link_conf[link_id]); if (WARN_ON(!vlan_conf)) continue; rcu_assign_pointer(vlan_conf->chanctx_conf, conf); } - rcu_read_unlock(); } void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, @@ -1095,7 +1057,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); } else { - ieee80211_free_chanctx(sdata->local, ctx); + ieee80211_free_chanctx(sdata->local, ctx, false); } } @@ -1103,7 +1065,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) } int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef, + const struct ieee80211_chan_req *chanreq, enum ieee80211_chanctx_mode mode, bool radar_required) { @@ -1114,13 +1076,14 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, lockdep_assert_wiphy(local->hw.wiphy); curr_ctx = ieee80211_link_get_chanctx(link); - if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx) + if (curr_ctx && !local->ops->switch_vif_chanctx) return -EOPNOTSUPP; - new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode); + new_ctx = ieee80211_find_reservation_chanctx(local, chanreq, mode); if (!new_ctx) { if (ieee80211_can_create_new_chanctx(local)) { - new_ctx = ieee80211_new_chanctx(local, chandef, mode); + new_ctx = ieee80211_new_chanctx(local, chanreq, mode, + false); if (IS_ERR(new_ctx)) return PTR_ERR(new_ctx); } else { @@ -1174,7 +1137,7 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, !list_empty(&curr_ctx->reserved_links)) return -EBUSY; - new_ctx = ieee80211_alloc_chanctx(local, chandef, mode); + new_ctx = ieee80211_alloc_chanctx(local, chanreq, mode); if (!new_ctx) return -ENOMEM; @@ -1192,7 +1155,7 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links); link->reserved_chanctx = new_ctx; - link->reserved_chandef = *chandef; + link->reserved = *chanreq; link->reserved_radar_required = radar_required; link->reserved_ready = false; @@ -1231,29 +1194,28 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) } static void -ieee80211_link_update_chandef(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef) +ieee80211_link_update_chanreq(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *chanreq) { struct ieee80211_sub_if_data *sdata = link->sdata; unsigned int link_id = link->link_id; struct ieee80211_sub_if_data *vlan; - link->conf->chandef = *chandef; + link->conf->chanreq = *chanreq; if (sdata->vif.type != NL80211_IFTYPE_AP) return; - rcu_read_lock(); list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { struct ieee80211_bss_conf *vlan_conf; - vlan_conf = rcu_dereference(vlan->vif.link_conf[link_id]); + vlan_conf = wiphy_dereference(sdata->local->hw.wiphy, + vlan->vif.link_conf[link_id]); if (WARN_ON(!vlan_conf)) continue; - vlan_conf->chandef = *chandef; + vlan_conf->chanreq = *chanreq; } - rcu_read_unlock(); } static int @@ -1264,7 +1226,8 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) struct ieee80211_local *local = sdata->local; struct ieee80211_vif_chanctx_switch vif_chsw[1] = {}; struct ieee80211_chanctx *old_ctx, *new_ctx; - const struct cfg80211_chan_def *chandef; + const struct ieee80211_chan_req *chanreq; + struct ieee80211_chan_req tmp; u64 changed = 0; int err; @@ -1286,17 +1249,18 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) IEEE80211_CHANCTX_REPLACES_OTHER)) return -EINVAL; - chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, - &link->reserved_chandef); - if (WARN_ON(!chandef)) + chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, + &link->reserved, + &tmp); + if (WARN_ON(!chanreq)) return -EINVAL; - if (link_conf->chandef.width != link->reserved_chandef.width) + if (link_conf->chanreq.oper.width != link->reserved.oper.width) changed = BSS_CHANGED_BANDWIDTH; - ieee80211_link_update_chandef(link, &link->reserved_chandef); + ieee80211_link_update_chanreq(link, &link->reserved); - _ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef, link); + _ieee80211_change_chanctx(local, new_ctx, old_ctx, chanreq, link); vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; @@ -1310,7 +1274,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) CHANCTX_SWMODE_REASSIGN_VIF); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) - ieee80211_free_chanctx(local, new_ctx); + ieee80211_free_chanctx(local, new_ctx, false); goto out; } @@ -1324,7 +1288,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) ieee80211_check_fast_xmit_iface(sdata); if (ieee80211_chanctx_refcount(local, old_ctx) == 0) - ieee80211_free_chanctx(local, old_ctx); + ieee80211_free_chanctx(local, old_ctx, false); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); ieee80211_recalc_smps_chanctx(local, new_ctx); @@ -1344,7 +1308,8 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *old_ctx, *new_ctx; - const struct cfg80211_chan_def *chandef; + const struct ieee80211_chan_req *chanreq; + struct ieee80211_chan_req tmp; int err; old_ctx = ieee80211_link_get_chanctx(link); @@ -1363,20 +1328,21 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) IEEE80211_CHANCTX_REPLACES_OTHER)) return -EINVAL; - chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, - &link->reserved_chandef); - if (WARN_ON(!chandef)) + chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx, + &link->reserved, + &tmp); + if (WARN_ON(!chanreq)) return -EINVAL; - ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef); + ieee80211_change_chanctx(local, new_ctx, new_ctx, chanreq); list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; - err = ieee80211_assign_link_chanctx(link, new_ctx); + err = ieee80211_assign_link_chanctx(link, new_ctx, false); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) - ieee80211_free_chanctx(local, new_ctx); + ieee80211_free_chanctx(local, new_ctx, false); goto out; } @@ -1412,24 +1378,6 @@ ieee80211_link_has_in_place_reservation(struct ieee80211_link_data *link) return true; } -static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local, - struct ieee80211_chanctx *new_ctx) -{ - const struct cfg80211_chan_def *chandef; - - lockdep_assert_wiphy(local->hw.wiphy); - - chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL); - if (WARN_ON(!chandef)) - return -EINVAL; - - local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled; - local->_oper_chandef = *chandef; - ieee80211_hw_config(local, 0); - - return 0; -} - static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, int n_vifs) { @@ -1491,7 +1439,7 @@ static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; - ieee80211_del_chanctx(local, ctx->replace_ctx); + ieee80211_del_chanctx(local, ctx->replace_ctx, false); err = ieee80211_add_chanctx(local, ctx); if (err) goto err; @@ -1508,7 +1456,7 @@ err: if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; - ieee80211_del_chanctx(local, ctx); + ieee80211_del_chanctx(local, ctx, false); WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx)); } @@ -1518,7 +1466,6 @@ err: static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) { struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; - struct ieee80211_chanctx *new_ctx = NULL; int err, n_assigned, n_reserved, n_ready; int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; @@ -1551,9 +1498,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) goto err; } - if (!local->use_chanctx) - new_ctx = ctx; - n_ctx++; n_assigned = 0; @@ -1607,9 +1551,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) if (WARN_ON(n_ctx == 0) || WARN_ON(n_vifs_switch == 0 && n_vifs_assign == 0 && - n_vifs_ctxless == 0) || - WARN_ON(n_ctx > 1 && !local->use_chanctx) || - WARN_ON(!new_ctx && !local->use_chanctx)) { + n_vifs_ctxless == 0)) { err = -EINVAL; goto err; } @@ -1619,20 +1561,14 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * reservations and driver capabilities. */ - if (local->use_chanctx) { - if (n_vifs_switch > 0) { - err = ieee80211_chsw_switch_vifs(local, n_vifs_switch); - if (err) - goto err; - } + if (n_vifs_switch > 0) { + err = ieee80211_chsw_switch_vifs(local, n_vifs_switch); + if (err) + goto err; + } - if (n_vifs_assign > 0 || n_vifs_ctxless > 0) { - err = ieee80211_chsw_switch_ctxs(local); - if (err) - goto err; - } - } else { - err = ieee80211_chsw_switch_hwconf(local, new_ctx); + if (n_vifs_assign > 0 || n_vifs_ctxless > 0) { + err = ieee80211_chsw_switch_ctxs(local); if (err) goto err; } @@ -1672,10 +1608,10 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) link->radar_required = link->reserved_radar_required; - if (link_conf->chandef.width != link->reserved_chandef.width) + if (link_conf->chanreq.oper.width != link->reserved.oper.width) changed = BSS_CHANGED_BANDWIDTH; - ieee80211_link_update_chandef(link, &link->reserved_chandef); + ieee80211_link_update_chanreq(link, &link->reserved); if (changed) ieee80211_link_info_change_notify(sdata, link, @@ -1772,7 +1708,8 @@ err: return err; } -static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) +void __ieee80211_link_release_channel(struct ieee80211_link_data *link, + bool skip_idle_recalc) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; @@ -1798,9 +1735,9 @@ static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) ieee80211_link_unreserve_chanctx(link); } - ieee80211_assign_link_chanctx(link, NULL); + ieee80211_assign_link_chanctx(link, NULL, false); if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); + ieee80211_free_chanctx(local, ctx, skip_idle_recalc); link->radar_required = false; @@ -1809,56 +1746,69 @@ static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) ieee80211_vif_use_reserved_switch(local); } -int ieee80211_link_use_channel(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef, - enum ieee80211_chanctx_mode mode) +int _ieee80211_link_use_channel(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *chanreq, + enum ieee80211_chanctx_mode mode, + bool assign_on_failure) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *ctx; u8 radar_detect_width = 0; + bool reserved = false; int ret; lockdep_assert_wiphy(local->hw.wiphy); - if (sdata->vif.active_links && - !(sdata->vif.active_links & BIT(link->link_id))) { - ieee80211_link_update_chandef(link, chandef); + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) { + ieee80211_link_update_chanreq(link, chanreq); return 0; } ret = cfg80211_chandef_dfs_required(local->hw.wiphy, - chandef, + &chanreq->oper, sdata->wdev.iftype); if (ret < 0) goto out; if (ret > 0) - radar_detect_width = BIT(chandef->width); + radar_detect_width = BIT(chanreq->oper.width); link->radar_required = ret; - ret = ieee80211_check_combinations(sdata, chandef, mode, + ret = ieee80211_check_combinations(sdata, &chanreq->oper, mode, radar_detect_width); if (ret < 0) goto out; - __ieee80211_link_release_channel(link); + __ieee80211_link_release_channel(link, false); - ctx = ieee80211_find_chanctx(local, chandef, mode); - if (!ctx) - ctx = ieee80211_new_chanctx(local, chandef, mode); + ctx = ieee80211_find_chanctx(local, link, chanreq, mode); + /* Note: context is now reserved */ + if (ctx) + reserved = true; + else + ctx = ieee80211_new_chanctx(local, chanreq, mode, + assign_on_failure); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto out; } - ieee80211_link_update_chandef(link, chandef); + ieee80211_link_update_chanreq(link, chanreq); + + ret = ieee80211_assign_link_chanctx(link, ctx, assign_on_failure); + + if (reserved) { + /* remove reservation */ + WARN_ON(link->reserved_chanctx != ctx); + link->reserved_chanctx = NULL; + list_del(&link->reserved_chanctx_list); + } - ret = ieee80211_assign_link_chanctx(link, ctx); if (ret) { /* if assign fails refcount stays the same */ if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); + ieee80211_free_chanctx(local, ctx, false); goto out; } @@ -1932,28 +1882,79 @@ int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link) return 0; } -int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef, - u64 *changed) +/* + * This is similar to ieee80211_chanctx_compatible(), but rechecks + * against all the links actually using it (except the one that's + * passed, since that one is changing). + * This is done in order to allow changes to the AP's bandwidth for + * wider bandwidth OFDMA purposes, which wouldn't be treated as + * compatible by ieee80211_chanctx_recheck() but is OK if the link + * requesting the update is the only one using it. + */ +static const struct ieee80211_chan_req * +ieee80211_chanctx_recheck(struct ieee80211_local *local, + struct ieee80211_link_data *skip_link, + struct ieee80211_chanctx *ctx, + const struct ieee80211_chan_req *req, + struct ieee80211_chan_req *tmp) +{ + const struct ieee80211_chan_req *ret = req; + struct ieee80211_link_data *link; + + lockdep_assert_wiphy(local->hw.wiphy); + + for_each_sdata_link(local, link) { + if (link == skip_link) + continue; + + if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { + ret = ieee80211_chanreq_compatible(ret, + &link->conf->chanreq, + tmp); + if (!ret) + return NULL; + } + + if (link->reserved_chanctx == ctx) { + ret = ieee80211_chanreq_compatible(ret, + &link->reserved, + tmp); + if (!ret) + return NULL; + } + } + + *tmp = *ret; + return tmp; +} + +int ieee80211_link_change_chanreq(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *chanreq, + u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *ctx; - const struct cfg80211_chan_def *compat; + const struct ieee80211_chan_req *compat; + struct ieee80211_chan_req tmp; lockdep_assert_wiphy(local->hw.wiphy); - if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, + if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, + &chanreq->oper, IEEE80211_CHAN_DISABLED)) return -EINVAL; - if (cfg80211_chandef_identical(chandef, &link_conf->chandef)) + /* for non-HT 20 MHz the rest doesn't matter */ + if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT && + cfg80211_chandef_identical(&chanreq->oper, &link_conf->chanreq.oper)) return 0; - if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT || - link_conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT) + /* but you cannot switch to/from it */ + if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT || + link_conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT) return -EINVAL; conf = rcu_dereference_protected(link_conf->chanctx_conf, @@ -1963,13 +1964,14 @@ int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, ctx = container_of(conf, struct ieee80211_chanctx, conf); - compat = cfg80211_chandef_compatible(&conf->def, chandef); + compat = ieee80211_chanctx_recheck(local, link, ctx, chanreq, &tmp); if (!compat) return -EINVAL; switch (ctx->replace_state) { case IEEE80211_CHANCTX_REPLACE_NONE: - if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat)) + if (!ieee80211_chanctx_reserved_chanreq(local, ctx, compat, + &tmp)) return -EBUSY; break; case IEEE80211_CHANCTX_WILL_BE_REPLACED: @@ -1984,7 +1986,7 @@ int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, break; } - ieee80211_link_update_chandef(link, chandef); + ieee80211_link_update_chanreq(link, chanreq); ieee80211_recalc_chanctx_chantype(local, ctx); @@ -1999,7 +2001,7 @@ void ieee80211_link_release_channel(struct ieee80211_link_data *link) lockdep_assert_wiphy(sdata->local->hw.wiphy); if (rcu_access_pointer(link->conf->chanctx_conf)) - __ieee80211_link_release_channel(link); + __ieee80211_link_release_channel(link, false); } void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link) @@ -2019,12 +2021,11 @@ void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link) ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); - rcu_read_lock(); - ap_conf = rcu_dereference(ap->vif.link_conf[link_id]); - conf = rcu_dereference_protected(ap_conf->chanctx_conf, - lockdep_is_held(&local->hw.wiphy->mtx)); + ap_conf = wiphy_dereference(local->hw.wiphy, + ap->vif.link_conf[link_id]); + conf = wiphy_dereference(local->hw.wiphy, + ap_conf->chanctx_conf); rcu_assign_pointer(link_conf->chanctx_conf, conf); - rcu_read_unlock(); } void ieee80211_iter_chan_contexts_atomic( diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h index d49894df23..35a8ba25fa 100644 --- a/net/mac80211/debug.h +++ b/net/mac80211/debug.h @@ -152,16 +152,17 @@ do { \ else \ _sdata_err((link)->sdata, fmt, ##__VA_ARGS__); \ } while (0) -#define link_dbg(link, fmt, ...) \ +#define _link_id_dbg(print, sdata, link_id, fmt, ...) \ do { \ - if (ieee80211_vif_is_mld(&(link)->sdata->vif)) \ - _sdata_dbg(1, (link)->sdata, "[link %d] " fmt, \ - (link)->link_id, \ - ##__VA_ARGS__); \ + if (ieee80211_vif_is_mld(&(sdata)->vif)) \ + _sdata_dbg(print, sdata, "[link %d] " fmt, \ + link_id, ##__VA_ARGS__); \ else \ - _sdata_dbg(1, (link)->sdata, fmt, \ - ##__VA_ARGS__); \ + _sdata_dbg(print, sdata, fmt, ##__VA_ARGS__); \ } while (0) +#define link_dbg(link, fmt, ...) \ + _link_id_dbg(1, (link)->sdata, (link)->link_id, \ + fmt, ##__VA_ARGS__) #define ht_dbg(sdata, fmt, ...) \ _sdata_dbg(MAC80211_HT_DEBUG, \ @@ -226,6 +227,9 @@ do { \ #define mlme_dbg(sdata, fmt, ...) \ _sdata_dbg(MAC80211_MLME_DEBUG, \ sdata, fmt, ##__VA_ARGS__) +#define mlme_link_id_dbg(sdata, link_id, fmt, ...) \ + _link_id_dbg(MAC80211_MLME_DEBUG, sdata, link_id, \ + fmt, ##__VA_ARGS__) #define mlme_dbg_ratelimited(sdata, fmt, ...) \ _sdata_dbg(MAC80211_MLME_DEBUG && net_ratelimit(), \ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 74be49191e..98310188f3 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -4,7 +4,7 @@ * * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018 - 2019, 2021-2023 Intel Corporation + * Copyright (C) 2018 - 2019, 2021-2024 Intel Corporation */ #include <linux/debugfs.h> @@ -498,6 +498,8 @@ static const char *hw_flag_names[] = { FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), FLAG(DISALLOW_PUNCTURING), + FLAG(DISALLOW_PUNCTURING_5GHZ), + FLAG(HANDLES_QUIET_CSA), #undef FLAG }; diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 3b7f70073f..254d745832 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015 Intel Deutschland GmbH - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation */ #include <net/mac80211.h> #include "ieee80211_i.h" @@ -214,8 +214,7 @@ int drv_conf_tx(struct ieee80211_local *local, if (!check_sdata_in_driver(sdata)) return -EIO; - if (sdata->vif.active_links && - !(sdata->vif.active_links & BIT(link->link_id))) + if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; if (params->cw_min == 0 || params->cw_min > params->cw_max) { @@ -312,11 +311,22 @@ int drv_assign_vif_chanctx(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); + /* + * We should perhaps push emulate chanctx down and only + * make it call ->config() when the chanctx is actually + * assigned here (and unassigned below), but that's yet + * another change to all drivers to add assign/unassign + * emulation callbacks. Maybe later. + */ + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + local->emulate_chanctx && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return 0; + if (!check_sdata_in_driver(sdata)) return -EIO; - if (sdata->vif.active_links && - !(sdata->vif.active_links & BIT(link_conf->link_id))) + if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id)) return 0; trace_drv_assign_vif_chanctx(local, sdata, link_conf, ctx); @@ -340,11 +350,15 @@ void drv_unassign_vif_chanctx(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + local->emulate_chanctx && + !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return; + if (!check_sdata_in_driver(sdata)) return; - if (sdata->vif.active_links && - !(sdata->vif.active_links & BIT(link_conf->link_id))) + if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id)) return; trace_drv_unassign_vif_chanctx(local, sdata, link_conf, ctx); @@ -461,8 +475,7 @@ void drv_link_info_changed(struct ieee80211_local *local, if (!check_sdata_in_driver(sdata)) return; - if (sdata->vif.active_links && - !(sdata->vif.active_links & BIT(link_id))) + if (!ieee80211_vif_link_active(&sdata->vif, link_id)) return; trace_drv_link_info_changed(local, sdata, info, changed); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index eb482fb8c3..5d078c0a23 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH -* Copyright (C) 2018 - 2019, 2021 - 2023 Intel Corporation +* Copyright (C) 2018-2019, 2021-2024 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -1180,8 +1180,9 @@ drv_post_channel_switch(struct ieee80211_link_data *link) } static inline void -drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata) +drv_abort_channel_switch(struct ieee80211_link_data *link) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; might_sleep(); @@ -1193,7 +1194,8 @@ drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata) trace_drv_abort_channel_switch(local, sdata); if (local->ops->abort_channel_switch) - local->ops->abort_channel_switch(&local->hw, &sdata->vif); + local->ops->abort_channel_switch(&local->hw, &sdata->vif, + link->conf); } static inline void @@ -1695,4 +1697,23 @@ int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 old_links, u16 new_links); +static inline enum ieee80211_neg_ttlm_res +drv_can_neg_ttlm(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_neg_ttlm *neg_ttlm) +{ + enum ieee80211_neg_ttlm_res res = NEG_TTLM_RES_REJECT; + + might_sleep(); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_can_neg_ttlm(local, sdata, neg_ttlm); + if (local->ops->can_neg_ttlm) + res = local->ops->can_neg_ttlm(&local->hw, &sdata->vif, + neg_ttlm); + trace_drv_neg_ttlm_res(local, sdata, res, neg_ttlm); + + return res; +} #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h index 12a6f0e9ec..59e3ec4dc9 100644 --- a/net/mac80211/drop.h +++ b/net/mac80211/drop.h @@ -2,7 +2,7 @@ /* * mac80211 drop reason list * - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation */ #ifndef MAC80211_DROP_H @@ -66,6 +66,7 @@ typedef unsigned int __bitwise ieee80211_rx_result; R(RX_DROP_U_UNEXPECTED_STA_4ADDR) \ R(RX_DROP_U_UNEXPECTED_VLAN_MCAST) \ R(RX_DROP_U_NOT_PORT_CONTROL) \ + R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \ /* this line for the trailing \ - add before this */ /* having two enums allows for checking ieee80211_rx_result use with sparse */ diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 9f5ffdc9db..ecbb042dd0 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -230,15 +230,21 @@ ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, if (!he_spr_ie_elem) return; + + he_obss_pd->sr_ctrl = he_spr_ie_elem->he_sr_control; data = he_spr_ie_elem->optional; if (he_spr_ie_elem->he_sr_control & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) - data++; + he_obss_pd->non_srg_max_offset = *data++; + if (he_spr_ie_elem->he_sr_control & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) { - he_obss_pd->max_offset = *data++; he_obss_pd->min_offset = *data++; + he_obss_pd->max_offset = *data++; + memcpy(he_obss_pd->bss_color_bitmap, data, 8); + data += 8; + memcpy(he_obss_pd->partial_bssid_bitmap, data, 8); he_obss_pd->enable = true; } } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 749f4ecab9..d7e8cf8e48 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020-2023 Intel Corporation + * Copyright(c) 2020-2024 Intel Corporation */ #include <linux/ieee80211.h> @@ -257,7 +257,7 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, if (WARN_ON(!link_conf)) width = NL80211_CHAN_WIDTH_20_NOHT; else - width = link_conf->chandef.width; + width = link_conf->chanreq.oper.width; switch (width) { default: @@ -580,7 +580,7 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, /* we'll do more on status of this frame */ info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - /* we have 12 bits, and need 6: link_id 4, smps 2 */ + /* we have 13 bits, and need 6: link_id 4, smps 2 */ info->status_data = IEEE80211_STATUS_TYPE_SMPS | u16_encode_bits(status_link_id << 2 | smps, IEEE80211_STATUS_SUBDATA_MASK); @@ -603,6 +603,8 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, unsigned int link_id, if (WARN_ON(!link)) goto out; + trace_api_request_smps(sdata->local, sdata, link, smps_mode); + if (link->u.mgd.driver_smps_mode == smps_mode) goto out; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 8f2b445a5e..7ace5cdc6c 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -223,7 +223,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; struct cfg80211_bss *bss; u64 bss_change; - struct cfg80211_chan_def chandef; + struct ieee80211_chan_req chanreq = {}; struct ieee80211_channel *chan; struct beacon_data *presp; struct cfg80211_inform_bss bss_meta = {}; @@ -237,7 +237,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, drv_reset_tsf(local, sdata); if (!ether_addr_equal(ifibss->bssid, bssid)) - sta_info_flush(sdata); + sta_info_flush(sdata, -1); /* if merging, indicate to driver that we leave the old IBSS */ if (sdata->vif.cfg.ibss_joined) { @@ -257,22 +257,22 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, kfree_rcu(presp, rcu_head); /* make a copy of the chandef, it could be modified below. */ - chandef = *req_chandef; - chan = chandef.chan; - if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef, + chanreq.oper = *req_chandef; + chan = chanreq.oper.chan; + if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper, NL80211_IFTYPE_ADHOC)) { - if (chandef.width == NL80211_CHAN_WIDTH_5 || - chandef.width == NL80211_CHAN_WIDTH_10 || - chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - chandef.width == NL80211_CHAN_WIDTH_20) { + if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + chanreq.oper.width == NL80211_CHAN_WIDTH_10 || + chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + chanreq.oper.width == NL80211_CHAN_WIDTH_20) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); return; } - chandef.width = NL80211_CHAN_WIDTH_20; - chandef.center_freq1 = chan->center_freq; + chanreq.oper.width = NL80211_CHAN_WIDTH_20; + chanreq.oper.center_freq1 = chan->center_freq; /* check again for downgraded chandef */ - if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef, + if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper, NL80211_IFTYPE_ADHOC)) { sdata_info(sdata, "Failed to join IBSS, beacons forbidden\n"); @@ -281,7 +281,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, - &chandef, NL80211_IFTYPE_ADHOC); + &chanreq.oper, NL80211_IFTYPE_ADHOC); if (err < 0) { sdata_info(sdata, "Failed to join IBSS, invalid chandef\n"); @@ -295,7 +295,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, radar_required = err; - if (ieee80211_link_use_channel(&sdata->deflink, &chandef, + if (ieee80211_link_use_channel(&sdata->deflink, &chanreq, ifibss->fixed_channel ? IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE)) { @@ -307,7 +307,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, memcpy(ifibss->bssid, bssid, ETH_ALEN); presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates, - capability, tsf, &chandef, + capability, tsf, &chanreq.oper, &have_higher_than_11mbit, NULL); if (!presp) return; @@ -533,12 +533,12 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) IEEE80211_PRIVACY(ifibss->privacy)); /* XXX: should not really modify cfg80211 data */ if (cbss) { - cbss->channel = sdata->deflink.csa_chandef.chan; + cbss->channel = sdata->deflink.csa_chanreq.oper.chan; cfg80211_put_bss(sdata->local->hw.wiphy, cbss); } } - ifibss->chandef = sdata->deflink.csa_chandef; + ifibss->chandef = sdata->deflink.csa_chanreq.oper; /* generate the beacon */ return ieee80211_ibss_csa_beacon(sdata, NULL, changed); @@ -682,7 +682,7 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) ifibss->state = IEEE80211_IBSS_MLME_SEARCH; - sta_info_flush(sdata); + sta_info_flush(sdata, -1); spin_lock_bh(&ifibss->incomplete_lock); while (!list_empty(&ifibss->incomplete_stations)) { @@ -757,21 +757,22 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; enum nl80211_channel_type ch_type; int err; - ieee80211_conn_flags_t conn_flags; + struct ieee80211_conn_settings conn = { + .mode = IEEE80211_CONN_MODE_HT, + .bw_limit = IEEE80211_CONN_BW_LIMIT_40, + }; u32 vht_cap_info = 0; lockdep_assert_wiphy(sdata->local->hw.wiphy); - conn_flags = IEEE80211_CONN_DISABLE_VHT; - switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20_NOHT: - conn_flags |= IEEE80211_CONN_DISABLE_HT; + conn.mode = IEEE80211_CONN_MODE_LEGACY; fallthrough; case NL80211_CHAN_WIDTH_20: - conn_flags |= IEEE80211_CONN_DISABLE_40MHZ; + conn.bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; default: break; @@ -783,8 +784,8 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, - vht_cap_info, - conn_flags, ifibss->bssid, &csa_ie); + vht_cap_info, &conn, + ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) goto disconnect; @@ -798,7 +799,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, goto disconnect; params.count = csa_ie.count; - params.chandef = csa_ie.chandef; + params.chandef = csa_ie.chanreq.oper; switch (ifibss->chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: @@ -857,7 +858,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, params.radar_required = err; if (cfg80211_chandef_identical(¶ms.chandef, - &sdata->vif.bss_conf.chandef)) { + &sdata->vif.bss_conf.chanreq.oper)) { ibss_dbg(sdata, "received csa with an identical chandef, ignoring\n"); return true; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a18361afea..3cedfdc909 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #ifndef IEEE80211_I_H @@ -89,7 +89,8 @@ enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, IEEE80211_STATUS_TYPE_SMPS = 1, - IEEE80211_STATUS_SUBDATA_MASK = 0xff0, + IEEE80211_STATUS_TYPE_NEG_TTLM = 2, + IEEE80211_STATUS_SUBDATA_MASK = 0x1ff0, }; static inline bool @@ -131,7 +132,7 @@ struct ieee80211_bss { }; /** - * enum ieee80211_corrupt_data_flags - BSS data corruption flags + * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted * @@ -144,7 +145,7 @@ enum ieee80211_bss_corrupt_data_flags { }; /** - * enum ieee80211_valid_data_flags - BSS valid data flags + * enum ieee80211_bss_valid_data_flags - BSS valid data flags * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE @@ -370,19 +371,32 @@ enum ieee80211_sta_flags { IEEE80211_STA_ENABLE_RRM = BIT(15), }; -typedef u32 __bitwise ieee80211_conn_flags_t; - -enum ieee80211_conn_flags { - IEEE80211_CONN_DISABLE_HT = (__force ieee80211_conn_flags_t)BIT(0), - IEEE80211_CONN_DISABLE_40MHZ = (__force ieee80211_conn_flags_t)BIT(1), - IEEE80211_CONN_DISABLE_VHT = (__force ieee80211_conn_flags_t)BIT(2), - IEEE80211_CONN_DISABLE_80P80MHZ = (__force ieee80211_conn_flags_t)BIT(3), - IEEE80211_CONN_DISABLE_160MHZ = (__force ieee80211_conn_flags_t)BIT(4), - IEEE80211_CONN_DISABLE_HE = (__force ieee80211_conn_flags_t)BIT(5), - IEEE80211_CONN_DISABLE_EHT = (__force ieee80211_conn_flags_t)BIT(6), - IEEE80211_CONN_DISABLE_320MHZ = (__force ieee80211_conn_flags_t)BIT(7), +enum ieee80211_conn_mode { + IEEE80211_CONN_MODE_S1G, + IEEE80211_CONN_MODE_LEGACY, + IEEE80211_CONN_MODE_HT, + IEEE80211_CONN_MODE_VHT, + IEEE80211_CONN_MODE_HE, + IEEE80211_CONN_MODE_EHT, }; +#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_EHT + +enum ieee80211_conn_bw_limit { + IEEE80211_CONN_BW_LIMIT_20, + IEEE80211_CONN_BW_LIMIT_40, + IEEE80211_CONN_BW_LIMIT_80, + IEEE80211_CONN_BW_LIMIT_160, /* also 80+80 */ + IEEE80211_CONN_BW_LIMIT_320, +}; + +struct ieee80211_conn_settings { + enum ieee80211_conn_mode mode; + enum ieee80211_conn_bw_limit bw_limit; +}; + +extern const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited; + struct ieee80211_mgd_auth_data { struct cfg80211_bss *bss; unsigned long timeout; @@ -416,7 +430,7 @@ struct ieee80211_mgd_assoc_data { size_t elems_len; u8 *elems; /* pointing to inside ie[] below */ - ieee80211_conn_flags_t conn_flags; + struct ieee80211_conn_settings conn; u16 status; @@ -441,6 +455,7 @@ struct ieee80211_mgd_assoc_data { bool timeout_started; bool comeback; /* whether the AP has requested association comeback */ bool s1g; + bool spp_amsdu; unsigned int assoc_link_id; @@ -509,6 +524,8 @@ struct ieee80211_if_managed { unsigned int flags; + u16 mcast_seq_last; + bool status_acked; bool status_received; __le16 status_fc; @@ -579,6 +596,11 @@ struct ieee80211_if_managed { /* TID-to-link mapping support */ struct wiphy_delayed_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; + struct wiphy_work teardown_ttlm_work; + + /* dialog token enumerator for neg TTLM request */ + u8 dialog_token_alloc; + struct wiphy_delayed_work neg_ttlm_timeout_work; }; struct ieee80211_if_ibss { @@ -664,7 +686,7 @@ struct mesh_csa_settings { }; /** - * struct mesh_table + * struct mesh_table - mesh hash table * * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. @@ -866,6 +888,9 @@ struct ieee80211_chanctx { enum ieee80211_chanctx_mode mode; bool driver_present; + /* temporary data for search algorithm etc. */ + struct ieee80211_chan_req req; + struct ieee80211_chanctx_conf conf; }; @@ -938,7 +963,7 @@ struct ieee80211_link_data_managed { enum ieee80211_smps_mode req_smps, /* requested smps mode */ driver_smps_mode; /* smps mode request */ - ieee80211_conn_flags_t conn_flags; + struct ieee80211_conn_settings conn; s16 p2p_noa_index; @@ -951,6 +976,7 @@ struct ieee80211_link_data_managed { bool csa_waiting_bcn; bool csa_ignored_same_chan; + bool csa_blocked_tx; struct wiphy_delayed_work chswitch_work; struct wiphy_work request_smps_work; @@ -983,8 +1009,6 @@ struct ieee80211_link_data_managed { int mu_edca_last_param_set; u8 bss_param_ch_cnt; - - struct cfg80211_bss *bss; }; struct ieee80211_link_data_ap { @@ -1013,11 +1037,10 @@ struct ieee80211_link_data { struct ieee80211_key __rcu *default_beacon_key; struct wiphy_work csa_finalize_work; - bool csa_block_tx; bool operating_11g_mode; - struct cfg80211_chan_def csa_chandef; + struct ieee80211_chan_req csa_chanreq; struct wiphy_work color_change_finalize_work; struct delayed_work color_collision_detect_work; @@ -1025,7 +1048,7 @@ struct ieee80211_link_data { /* context reservation -- protected with wiphy mutex */ struct ieee80211_chanctx *reserved_chanctx; - struct cfg80211_chan_def reserved_chandef; + struct ieee80211_chan_req reserved; bool reserved_radar_required; bool reserved_ready; @@ -1072,6 +1095,8 @@ struct ieee80211_sub_if_data { unsigned long state; + bool csa_blocked_queues; + char name[IFNAMSIZ]; struct ieee80211_fragment_cache frags; @@ -1137,6 +1162,8 @@ struct ieee80211_sub_if_data { struct wiphy_work activate_links_work; u16 desired_active_links; + u16 restart_active_links; + #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *subdir_stations; @@ -1160,6 +1187,19 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) #define sdata_dereference(p, sdata) \ wiphy_dereference(sdata->local->hw.wiphy, p) +#define for_each_sdata_link(_local, _link) \ + /* outer loop just to define the variables ... */ \ + for (struct ieee80211_sub_if_data *___sdata = NULL; \ + !___sdata; \ + ___sdata = (void *)~0 /* always stop */) \ + list_for_each_entry(___sdata, &(_local)->interfaces, list) \ + if (ieee80211_sdata_running(___sdata)) \ + for (int ___link_id = 0; \ + ___link_id < ARRAY_SIZE(___sdata->link); \ + ___link_id++) \ + if ((_link = wiphy_dereference((local)->hw.wiphy, \ + ___sdata->link[___link_id]))) + static inline int ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, struct cfg80211_rnr_elems *rnr_elems, @@ -1330,7 +1370,8 @@ struct ieee80211_local { bool wiphy_ciphers_allocated; - bool use_chanctx; + struct cfg80211_chan_def dflt_chandef; + bool emulate_chanctx; /* protects the aggregated multicast list and filter calls */ spinlock_t filter_lock; @@ -1456,8 +1497,6 @@ struct ieee80211_local { enum mac80211_scan_state next_scan_state; struct wiphy_delayed_work scan_work; struct ieee80211_sub_if_data __rcu *scan_sdata; - /* For backward compatibility only -- do not use */ - struct cfg80211_chan_def _oper_chandef; /* Temporary remain-on-channel for off-channel operations */ struct ieee80211_channel *tmp_channel; @@ -1531,8 +1570,6 @@ struct ieee80211_local { int user_power_level; /* in dBm, for all interfaces */ - enum ieee80211_smps_mode smps_mode; - struct work_struct restart_work; #ifdef CONFIG_MAC80211_DEBUGFS @@ -1559,7 +1596,7 @@ struct ieee80211_local { /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; - struct cfg80211_chan_def monitor_chandef; + struct ieee80211_chan_req monitor_chanreq; /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; @@ -1624,7 +1661,7 @@ ieee80211_get_link_sband(struct ieee80211_link_data *link) /* this struct holds the value parsing from channel switch IE */ struct ieee80211_csa_ie { - struct cfg80211_chan_def chandef; + struct ieee80211_chan_req chanreq; u8 mode; u8 count; u8 ttl; @@ -1633,6 +1670,14 @@ struct ieee80211_csa_ie { u32 max_switch_time; }; +enum ieee80211_elems_parse_error { + IEEE80211_PARSE_ERR_INVALID_END = BIT(0), + IEEE80211_PARSE_ERR_DUP_ELEM = BIT(1), + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE = BIT(2), + IEEE80211_PARSE_ERR_UNEXPECTED_ELEM = BIT(3), + IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC = BIT(4), +}; + /* Parsed Information Elements */ struct ieee802_11_elems { const u8 *ie_start; @@ -1727,12 +1772,6 @@ struct ieee802_11_elems { size_t ml_basic_len; size_t ml_reconf_len; - /* The basic Multi-Link element in the original IEs */ - const struct element *ml_basic_elem; - - /* The reconfiguration Multi-Link element in the original IEs */ - const struct element *ml_reconf_elem; - u8 ttlm_num; /* @@ -1743,16 +1782,8 @@ struct ieee802_11_elems { struct ieee80211_mle_per_sta_profile *prof; size_t sta_prof_len; - /* whether a parse error occurred while retrieving these elements */ - bool parse_error; - - /* - * scratch buffer that can be used for various element parsing related - * tasks, e.g., element de-fragmentation etc. - */ - size_t scratch_len; - u8 *scratch_pos; - u8 scratch[] __counted_by(scratch_len); + /* whether/which parse error occurred while retrieving these elements */ + u8 parse_error; }; static inline struct ieee80211_local *hw_to_local( @@ -1801,6 +1832,8 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, unsigned int mpdu_len, unsigned int mpdu_offset); int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); +int ieee80211_hw_conf_chan(struct ieee80211_local *local); +void ieee80211_hw_conf_init(struct ieee80211_local *local); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u64 changed); @@ -1812,6 +1845,8 @@ void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, void ieee80211_configure_filter(struct ieee80211_local *local); u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); +void ieee80211_handle_queued_frames(struct ieee80211_local *local); + u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp); @@ -2166,9 +2201,8 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band * @vht_cap_info: VHT capabilities of the transmitter - * @conn_flags: contains information about own capabilities and restrictions - * to decide which channel switch announcements can be accepted, using - * flags from &enum ieee80211_conn_flags. + * @conn: contains information about own capabilities and restrictions + * to decide which channel switch announcements can be accepted * @bssid: the currently connected bssid (for reporting) * @csa_ie: parsed 802.11 csa elements on count, mode, chandef and mesh ttl. * All of them will be filled with if success only. @@ -2178,7 +2212,8 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, - ieee80211_conn_flags_t conn_flags, u8 *bssid, + struct ieee80211_conn_settings *conn, + u8 *bssid, struct ieee80211_csa_ie *csa_ie); /* Suspend/resume and hw reconfiguration */ @@ -2202,6 +2237,9 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ +const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode); +enum ieee80211_conn_bw_limit +ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef); int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble); void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, @@ -2243,6 +2281,7 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, /** * struct ieee80211_elems_parse_params - element parsing parameters + * @mode: connection mode for parsing * @start: pointer to the elements * @len: length of the elements * @action: %true if the elements came from an action frame @@ -2260,6 +2299,7 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * for EHT capabilities parsing) */ struct ieee80211_elems_parse_params { + enum ieee80211_conn_mode mode; const u8 *start; size_t len; bool action; @@ -2279,6 +2319,7 @@ ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, struct cfg80211_bss *bss) { struct ieee80211_elems_parse_params params = { + .mode = IEEE80211_CONN_MODE_HIGHEST, .start = start, .len = len, .action = action, @@ -2408,7 +2449,6 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); -u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end); enum { IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), @@ -2453,32 +2493,36 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); -u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); -u8 *ieee80211_ie_build_he_cap(ieee80211_conn_flags_t disable_flags, u8 *pos, - const struct ieee80211_sta_he_cap *he_cap, - u8 *end); -void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, - enum ieee80211_smps_mode smps_mode, - struct sk_buff *skb); +u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata); u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_eht_oper(u8 *pos, struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap); int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); -int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, bool need_basic, - enum nl80211_band band); -int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, bool need_basic, - enum nl80211_band band); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_s1g_cap *caps, struct sk_buff *skb); void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); -u8 *ieee80211_ie_build_s1g_cap(u8 *pos, struct ieee80211_sta_s1g_cap *s1g_cap); + +/* element building in SKBs */ +int ieee80211_put_srates_elem(struct sk_buff *skb, + const struct ieee80211_supported_band *sband, + u32 basic_rates, u32 rate_flags, u32 masked_rates, + u8 element_id); +int ieee80211_put_he_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband, + const struct ieee80211_conn_settings *conn); +int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode); +int ieee80211_put_eht_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband, + const struct ieee80211_conn_settings *conn); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, @@ -2488,23 +2532,46 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, - bool support_160, bool support_320, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); -ieee80211_conn_flags_t ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); +void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, + struct ieee80211_conn_settings *conn); +static inline void +ieee80211_chanreq_downgrade(struct ieee80211_chan_req *chanreq, + struct ieee80211_conn_settings *conn) +{ + ieee80211_chandef_downgrade(&chanreq->oper, conn); + if (WARN_ON(!conn)) + return; + if (conn->mode < IEEE80211_CONN_MODE_EHT) + chanreq->ap.chan = NULL; +} + +bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a, + const struct ieee80211_chan_req *b); int __must_check +_ieee80211_link_use_channel(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *req, + enum ieee80211_chanctx_mode mode, + bool assign_on_failure); + +static inline int __must_check ieee80211_link_use_channel(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef, - enum ieee80211_chanctx_mode mode); + const struct ieee80211_chan_req *req, + enum ieee80211_chanctx_mode mode) +{ + return _ieee80211_link_use_channel(link, req, mode, false); +} + int __must_check ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef, + const struct ieee80211_chan_req *req, enum ieee80211_chanctx_mode mode, bool radar_required); int __must_check @@ -2512,9 +2579,11 @@ ieee80211_link_use_reserved_context(struct ieee80211_link_data *link); int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link); int __must_check -ieee80211_link_change_bandwidth(struct ieee80211_link_data *link, - const struct cfg80211_chan_def *chandef, - u64 *changed); +ieee80211_link_change_chanreq(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *req, + u64 *changed); +void __ieee80211_link_release_channel(struct ieee80211_link_data *link, + bool skip_idle_recalc); void ieee80211_link_release_channel(struct ieee80211_link_data *link); void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link); void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, @@ -2561,7 +2630,7 @@ int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); -void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); +void ieee80211_teardown_tdls_peers(struct ieee80211_link_data *link); void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *peer, u16 reason); void @@ -2589,12 +2658,7 @@ u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw, void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache); void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); -u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); -u8 *ieee80211_ie_build_eht_cap(u8 *pos, - const struct ieee80211_sta_he_cap *he_cap, - const struct ieee80211_sta_eht_cap *eht_cap, - u8 *end, - bool for_ap); +u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata); void ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, @@ -2603,6 +2667,12 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_eht_cap_elem *eht_cap_ie_elem, u8 eht_cap_len, struct link_sta_info *link_sta); +void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); +int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, + struct cfg80211_ttlm_params *params); void ieee80211_check_wbrf_support(struct ieee80211_local *local); void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 11c4caa474..b935bb5d8e 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -8,7 +8,7 @@ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -511,7 +511,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do * would have removed them, but in other modes there shouldn't * be any stations. */ - flushed = sta_info_flush(sdata); + flushed = sta_info_flush(sdata, -1); WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && flushed > 0); /* don't count this interface for allmulti while it is down */ @@ -544,10 +544,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do sdata->vif.bss_conf.csa_active = false; if (sdata->vif.type == NL80211_IFTYPE_STATION) sdata->deflink.u.mgd.csa_waiting_bcn = false; - if (sdata->deflink.csa_block_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->deflink.csa_block_tx = false; + sdata->csa_blocked_queues = false; } wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa_finalize_work); @@ -557,7 +557,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do &sdata->deflink.dfs_cac_timer_work); if (sdata->wdev.cac_started) { - chandef = sdata->vif.bss_conf.chandef; + chandef = sdata->vif.bss_conf.chanreq.oper; WARN_ON(local->suspended); ieee80211_link_release_channel(&sdata->deflink); cfg80211_cac_event(sdata->dev, &chandef, @@ -686,6 +686,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_del_virtual_monitor(local); ieee80211_recalc_idle(local); + ieee80211_recalc_offload(local); if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; @@ -1121,9 +1122,6 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; int ret; - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return 0; - ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1145,11 +1143,13 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ieee80211_set_default_queues(sdata); - ret = drv_add_interface(local, sdata); - if (WARN_ON(ret)) { - /* ok .. stupid driver, it asked for this! */ - kfree(sdata); - return ret; + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { + ret = drv_add_interface(local, sdata); + if (WARN_ON(ret)) { + /* ok .. stupid driver, it asked for this! */ + kfree(sdata); + return ret; + } } set_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -1164,7 +1164,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) rcu_assign_pointer(local->monitor_sdata, sdata); mutex_unlock(&local->iflist_mtx); - ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chandef, + ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chanreq, IEEE80211_CHANCTX_EXCLUSIVE); if (ret) { mutex_lock(&local->iflist_mtx); @@ -1187,9 +1187,6 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; - if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) - return; - ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1209,7 +1206,8 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) ieee80211_link_release_channel(&sdata->deflink); - drv_remove_interface(local, sdata); + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + drv_remove_interface(local, sdata); kfree(sdata); } @@ -1252,7 +1250,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.cab_queue = master->vif.cab_queue; memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); - sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef; + sdata->vif.bss_conf.chanreq = master->vif.bss_conf.chanreq; sdata->crypto_tx_tailroom_needed_cnt += master->crypto_tx_tailroom_needed_cnt; @@ -1288,8 +1286,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) res = drv_start(local); if (res) goto err_del_bss; - /* we're brought up, everything changes */ - hw_reconf_flags = ~0; ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); @@ -1436,7 +1432,9 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (coming_up) local->open_count++; - if (hw_reconf_flags) + if (local->open_count == 1) + ieee80211_hw_conf_init(local); + else if (hw_reconf_flags) ieee80211_hw_config(local, hw_reconf_flags); ieee80211_recalc_ps(local); @@ -1546,6 +1544,22 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, default: break; } + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) { + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + switch (mgmt->u.action.u.ttlm_req.action_code) { + case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: + ieee80211_process_neg_ttlm_req(sdata, mgmt, + skb->len); + break; + case WLAN_PROTECTED_EHT_ACTION_TTLM_RES: + ieee80211_process_neg_ttlm_res(sdata, mgmt, + skb->len); + break; + default: + break; + } + } } else if (ieee80211_is_ext(mgmt->frame_control)) { if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_queued_ext(sdata, skb); @@ -1683,8 +1697,13 @@ static void ieee80211_activate_links_work(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, activate_links_work); + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (local->in_reconfig) + return; ieee80211_set_active_links(&sdata->vif, sdata->desired_active_links); + sdata->desired_active_links = 0; } /* diff --git a/net/mac80211/key.c b/net/mac80211/key.c index af74d7f9d9..eecdd2265e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -6,7 +6,7 @@ * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright 2018-2020, 2022-2023 Intel Corporation + * Copyright 2018-2020, 2022-2024 Intel Corporation */ #include <crypto/utils.h> @@ -925,6 +925,10 @@ int ieee80211_key_link(struct ieee80211_key *key, */ key->color = atomic_inc_return(&key_color); + /* keep this flag for easier access later */ + if (sta && sta->sta.spp_amsdu) + key->conf.flags |= IEEE80211_KEY_FLAG_SPP_AMSDU; + increment_tailroom_need_count(sdata); ret = ieee80211_key_replace(sdata, link, sta, pairwise, old_key, key); @@ -1368,12 +1372,19 @@ EXPORT_SYMBOL_GPL(ieee80211_remove_key); struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf) + struct ieee80211_key_conf *keyconf, + int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; int err; + struct ieee80211_link_data *link_data = + link_id < 0 ? &sdata->deflink : + sdata_dereference(sdata->link[link_id], sdata); + + if (WARN_ON(!link_data)) + return ERR_PTR(-EINVAL); if (WARN_ON(!local->wowlan)) return ERR_PTR(-EINVAL); @@ -1390,8 +1401,9 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; - /* FIXME: this function needs to get a link ID */ - err = ieee80211_key_link(key, &sdata->deflink, NULL); + key->conf.link_id = link_id; + + err = ieee80211_key_link(key, link_data, NULL); if (err) return ERR_PTR(err); diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d4f86955af..af0321408a 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -2,7 +2,7 @@ /* * MLO link handling * - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -73,6 +73,8 @@ void ieee80211_link_stop(struct ieee80211_link_data *link) ieee80211_mgd_stop_link(link); cancel_delayed_work_sync(&link->color_collision_detect_work); + wiphy_work_cancel(link->sdata->local->hw.wiphy, + &link->csa_finalize_work); ieee80211_link_release_channel(link); } @@ -354,9 +356,9 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, link = sdata_dereference(sdata->link[link_id], sdata); - /* FIXME: kill TDLS connections on the link */ + ieee80211_teardown_tdls_peers(link); - ieee80211_link_release_channel(link); + __ieee80211_link_release_channel(link, true); } list_for_each_entry(sta, &local->sta_list, list) { @@ -402,8 +404,24 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, link = sdata_dereference(sdata->link[link_id], sdata); - ret = ieee80211_link_use_channel(link, &link->conf->chandef, - IEEE80211_CHANCTX_SHARED); + /* + * This call really should not fail. Unfortunately, it appears + * that this may happen occasionally with some drivers. Should + * it happen, we are stuck in a bad place as going backwards is + * not really feasible. + * + * So lets just tell link_use_channel that it must not fail to + * assign the channel context (from mac80211's perspective) and + * assume the driver is going to trigger a recovery flow if it + * had a failure. + * That really is not great nor guaranteed to work. But at least + * the internal mac80211 state remains consistent and there is + * a chance that we can recover. + */ + ret = _ieee80211_link_use_channel(link, + &link->conf->chanreq, + IEEE80211_CHANCTX_SHARED, + true); WARN_ON_ONCE(ret); ieee80211_mgd_set_link_qos_params(link); @@ -444,10 +462,16 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links) lockdep_assert_wiphy(local->hw.wiphy); - if (!drv_can_activate_links(local, sdata, active_links)) + if (WARN_ON(!active_links)) return -EINVAL; old_active = sdata->vif.active_links; + if (old_active == active_links) + return 0; + + if (!drv_can_activate_links(local, sdata, active_links)) + return -EINVAL; + if (old_active & active_links) { /* * if there's at least one link that stays active across @@ -472,6 +496,9 @@ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + if (WARN_ON(!active_links)) + return; + if (!ieee80211_sdata_running(sdata)) return; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index f2ece77935..7ba329ebdd 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -93,16 +93,32 @@ static void ieee80211_reconfig_filter(struct wiphy *wiphy, ieee80211_configure_filter(local); } -static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) +static u32 ieee80211_calc_hw_conf_chan(struct ieee80211_local *local, + struct ieee80211_chanctx_conf *ctx) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef = {}; + struct cfg80211_chan_def *oper = NULL; + enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_STATIC; u32 changed = 0; int power; u32 offchannel_flag; + if (!local->emulate_chanctx) + return 0; + offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; + if (ctx && !WARN_ON(!ctx->def.chan)) { + oper = &ctx->def; + if (ctx->rx_chains_static > 1) + smps_mode = IEEE80211_SMPS_OFF; + else if (ctx->rx_chains_dynamic > 1) + smps_mode = IEEE80211_SMPS_DYNAMIC; + else + smps_mode = IEEE80211_SMPS_STATIC; + } + if (local->scan_chandef.chan) { chandef = local->scan_chandef; } else if (local->tmp_channel) { @@ -110,25 +126,30 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) chandef.width = NL80211_CHAN_WIDTH_20_NOHT; chandef.center_freq1 = chandef.chan->center_freq; chandef.freq1_offset = chandef.chan->freq_offset; - } else - chandef = local->_oper_chandef; + } else if (oper) { + chandef = *oper; + } else { + chandef = local->dflt_chandef; + } - WARN(!cfg80211_chandef_valid(&chandef), - "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz", - chandef.chan->center_freq, chandef.chan->freq_offset, - chandef.width, chandef.center_freq1, chandef.freq1_offset, - chandef.center_freq2); + if (WARN(!cfg80211_chandef_valid(&chandef), + "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz", + chandef.chan ? chandef.chan->center_freq : -1, + chandef.chan ? chandef.chan->freq_offset : 0, + chandef.width, chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2)) + return 0; - if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef)) + if (!oper || !cfg80211_chandef_identical(&chandef, oper)) local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; else local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL; offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; - if (offchannel_flag || - !cfg80211_chandef_identical(&local->hw.conf.chandef, - &local->_oper_chandef)) { + /* force it also for scanning, since drivers might config differently */ + if (offchannel_flag || local->scanning || local->in_reconfig || + !cfg80211_chandef_identical(&local->hw.conf.chandef, &chandef)) { local->hw.conf.chandef = chandef; changed |= IEEE80211_CONF_CHANGE_CHANNEL; } @@ -140,8 +161,8 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) * that otherwise STATIC is used. */ local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC; - } else if (local->hw.conf.smps_mode != local->smps_mode) { - local->hw.conf.smps_mode = local->smps_mode; + } else if (local->hw.conf.smps_mode != smps_mode) { + local->hw.conf.smps_mode = smps_mode; changed |= IEEE80211_CONF_CHANGE_SMPS; } @@ -173,12 +194,9 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) might_sleep(); - if (!local->use_chanctx) - changed |= ieee80211_hw_conf_chan(local); - else - changed &= ~(IEEE80211_CONF_CHANGE_CHANNEL | - IEEE80211_CONF_CHANGE_POWER | - IEEE80211_CONF_CHANGE_SMPS); + WARN_ON(changed & (IEEE80211_CONF_CHANGE_CHANNEL | + IEEE80211_CONF_CHANGE_POWER | + IEEE80211_CONF_CHANGE_SMPS)); if (changed && local->open_count) { ret = drv_config(local, changed); @@ -202,13 +220,115 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) return ret; } +/* for scanning, offchannel and chanctx emulation only */ +static int _ieee80211_hw_conf_chan(struct ieee80211_local *local, + struct ieee80211_chanctx_conf *ctx) +{ + u32 changed; + + if (!local->open_count) + return 0; + + changed = ieee80211_calc_hw_conf_chan(local, ctx); + if (!changed) + return 0; + + return drv_config(local, changed); +} + +int ieee80211_hw_conf_chan(struct ieee80211_local *local) +{ + struct ieee80211_chanctx *ctx; + + ctx = list_first_entry_or_null(&local->chanctx_list, + struct ieee80211_chanctx, + list); + + return _ieee80211_hw_conf_chan(local, ctx ? &ctx->conf : NULL); +} + +void ieee80211_hw_conf_init(struct ieee80211_local *local) +{ + u32 changed = ~(IEEE80211_CONF_CHANGE_CHANNEL | + IEEE80211_CONF_CHANGE_POWER | + IEEE80211_CONF_CHANGE_SMPS); + + if (WARN_ON(!local->open_count)) + return; + + if (local->emulate_chanctx) { + struct ieee80211_chanctx *ctx; + + ctx = list_first_entry_or_null(&local->chanctx_list, + struct ieee80211_chanctx, + list); + + changed |= ieee80211_calc_hw_conf_chan(local, + ctx ? &ctx->conf : NULL); + } + + WARN_ON(drv_config(local, changed)); +} + +int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx) +{ + struct ieee80211_local *local = hw_to_local(hw); + + local->hw.conf.radar_enabled = ctx->radar_enabled; + + return _ieee80211_hw_conf_chan(local, ctx); +} +EXPORT_SYMBOL(ieee80211_emulate_add_chanctx); + +void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx) +{ + struct ieee80211_local *local = hw_to_local(hw); + + local->hw.conf.radar_enabled = false; + + _ieee80211_hw_conf_chan(local, NULL); +} +EXPORT_SYMBOL(ieee80211_emulate_remove_chanctx); + +void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *ctx, + u32 changed) +{ + struct ieee80211_local *local = hw_to_local(hw); + + local->hw.conf.radar_enabled = ctx->radar_enabled; + + _ieee80211_hw_conf_chan(local, ctx); +} +EXPORT_SYMBOL(ieee80211_emulate_change_chanctx); + +int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, + struct ieee80211_vif_chanctx_switch *vifs, + int n_vifs, + enum ieee80211_chanctx_switch_mode mode) +{ + struct ieee80211_local *local = hw_to_local(hw); + + if (n_vifs <= 0) + return -EINVAL; + + local->hw.conf.radar_enabled = vifs[0].new_ctx->radar_enabled; + _ieee80211_hw_conf_chan(local, vifs[0].new_ctx); + + return 0; +} +EXPORT_SYMBOL(ieee80211_emulate_switch_vif_chanctx); + #define BSS_CHANGED_VIF_CFG_FLAGS (BSS_CHANGED_ASSOC |\ BSS_CHANGED_IDLE |\ BSS_CHANGED_PS |\ BSS_CHANGED_IBSS |\ BSS_CHANGED_ARP_FILTER |\ BSS_CHANGED_SSID |\ - BSS_CHANGED_MLD_VALID_LINKS) + BSS_CHANGED_MLD_VALID_LINKS |\ + BSS_CHANGED_MLD_TTLM) void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, u64 changed) @@ -303,9 +423,9 @@ u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_ERP_SLOT; } -static void ieee80211_tasklet_handler(struct tasklet_struct *t) +/* context: requires softirqs disabled */ +void ieee80211_handle_queued_frames(struct ieee80211_local *local) { - struct ieee80211_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue)) || @@ -330,6 +450,13 @@ static void ieee80211_tasklet_handler(struct tasklet_struct *t) } } +static void ieee80211_tasklet_handler(struct tasklet_struct *t) +{ + struct ieee80211_local *local = from_tasklet(local, t, tasklet); + + ieee80211_handle_queued_frames(local); +} + static void ieee80211_restart_work(struct work_struct *work) { struct ieee80211_local *local = @@ -644,7 +771,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, struct ieee80211_local *local; int priv_size, i; struct wiphy *wiphy; - bool use_chanctx; + bool emulate_chanctx; if (WARN_ON(!ops->tx || !ops->start || !ops->stop || !ops->config || !ops->add_interface || !ops->remove_interface || @@ -659,12 +786,26 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, return NULL; /* check all or no channel context operations exist */ - i = !!ops->add_chanctx + !!ops->remove_chanctx + - !!ops->change_chanctx + !!ops->assign_vif_chanctx + - !!ops->unassign_vif_chanctx; - if (WARN_ON(i != 0 && i != 5)) - return NULL; - use_chanctx = i == 5; + if (ops->add_chanctx == ieee80211_emulate_add_chanctx && + ops->remove_chanctx == ieee80211_emulate_remove_chanctx && + ops->change_chanctx == ieee80211_emulate_change_chanctx) { + if (WARN_ON(ops->assign_vif_chanctx || + ops->unassign_vif_chanctx)) + return NULL; + emulate_chanctx = true; + } else { + if (WARN_ON(ops->add_chanctx == ieee80211_emulate_add_chanctx || + ops->remove_chanctx == ieee80211_emulate_remove_chanctx || + ops->change_chanctx == ieee80211_emulate_change_chanctx)) + return NULL; + if (WARN_ON(!ops->add_chanctx || + !ops->remove_chanctx || + !ops->change_chanctx || + !ops->assign_vif_chanctx || + !ops->unassign_vif_chanctx)) + return NULL; + emulate_chanctx = false; + } /* Ensure 32-byte alignment of our private data and hw private data. * We use the wiphy priv data for both our ieee80211_local and for @@ -698,7 +839,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, WIPHY_FLAG_REPORTS_OBSS | WIPHY_FLAG_OFFCHAN_TX; - if (!use_chanctx || ops->remain_on_channel) + if (emulate_chanctx || ops->remain_on_channel) wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS | @@ -734,8 +875,11 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); } - if (!ops->set_key) + if (!ops->set_key) { wiphy->flags |= WIPHY_FLAG_IBSS_RSN; + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT); + } wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM); @@ -752,7 +896,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN); local->ops = ops; - local->use_chanctx = use_chanctx; + local->emulate_chanctx = emulate_chanctx; + + if (emulate_chanctx) + ieee80211_hw_set(&local->hw, CHANCTX_STA_CSA); /* * We need a bit of data queued to build aggregates properly, so @@ -829,7 +976,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_dfs_radar_detected_work); wiphy_work_init(&local->reconfig_filter, ieee80211_reconfig_filter); - local->smps_mode = IEEE80211_SMPS_OFF; wiphy_work_init(&local->dynamic_ps_enable_work, ieee80211_dynamic_ps_enable_work); @@ -980,7 +1126,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) * as much, e.g. monitoring beacons would be hard if we * might not even know which link is active at which time. */ - if (WARN_ON(!local->use_chanctx)) + if (WARN_ON(local->emulate_chanctx)) return -EINVAL; if (WARN_ON(!local->ops->link_info_changed)) @@ -1024,7 +1170,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return -EINVAL; #endif - if (!local->use_chanctx) { + if (local->emulate_chanctx) { for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *comb; @@ -1090,11 +1236,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) &sband->channels[i], NL80211_CHAN_NO_HT); /* init channel we're on */ - if (!local->use_chanctx && !local->_oper_chandef.chan) { + local->monitor_chanreq.oper = dflt_chandef; + if (local->emulate_chanctx) { + local->dflt_chandef = dflt_chandef; local->hw.conf.chandef = dflt_chandef; - local->_oper_chandef = dflt_chandef; } - local->monitor_chandef = dflt_chandef; } channels += sband->n_channels; @@ -1115,8 +1261,26 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_vht = supp_vht || sband->vht_cap.vht_supported; for_each_sband_iftype_data(sband, i, iftd) { + u8 he_40_mhz_cap; + supp_he = supp_he || iftd->he_cap.has_he; supp_eht = supp_eht || iftd->eht_cap.has_eht; + + if (band == NL80211_BAND_2GHZ) + he_40_mhz_cap = + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G; + else + he_40_mhz_cap = + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G; + + /* currently no support for HE client where HT has 40 MHz but not HT */ + if (iftd->he_cap.has_he && + iftd->types_mask & (BIT(NL80211_IFTYPE_STATION) | + BIT(NL80211_IFTYPE_P2P_CLIENT)) && + sband->ht_cap.ht_supported && + sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && + !(iftd->he_cap.he_cap_elem.phy_cap_info[0] & he_40_mhz_cap)) + return -EINVAL; } /* HT, VHT, HE require QoS, thus >= 4 queues */ diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 3d4806b7ff..6d4510221c 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2024 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -97,7 +97,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, if (sdata->vif.bss_conf.basic_rates != basic_rates) return false; - cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, + cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chanreq.oper.chan, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); @@ -107,10 +107,11 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); - ieee80211_chandef_he_6ghz_oper(sdata, ie->he_operation, ie->eht_operation, + ieee80211_chandef_he_6ghz_oper(sdata->local, ie->he_operation, + ie->eht_operation, &sta_chan_def); - if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, + if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chanreq.oper, &sta_chan_def)) return false; @@ -435,9 +436,9 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, return 0; if (!sband->ht_cap.ht_supported || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) @@ -476,16 +477,16 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; if (!ht_cap->ht_supported || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_operation)) return -ENOMEM; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); - ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chandef, + ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chanreq.oper, sdata->vif.bss_conf.ht_operation_mode, false); @@ -507,9 +508,9 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, return 0; if (!sband->vht_cap.vht_supported || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) @@ -548,9 +549,9 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, return 0; if (!vht_cap->vht_supported || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_operation)) @@ -558,7 +559,7 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); ieee80211_ie_build_vht_oper(pos, vht_cap, - &sdata->vif.bss_conf.chandef); + &sdata->vif.bss_conf.chanreq.oper); return 0; } @@ -566,29 +567,18 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ie_len) { - const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; - u8 *pos; sband = ieee80211_get_sband(sdata); if (!sband) return -EINVAL; - he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); - - if (!he_cap || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + if (sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; - if (skb_tailroom(skb) < ie_len) - return -ENOMEM; - - pos = skb_put(skb, ie_len); - ieee80211_ie_build_he_cap(0, pos, he_cap, pos + ie_len); - - return 0; + return ieee80211_put_he_cap(skb, sdata, sband, NULL); } int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, @@ -605,20 +595,20 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); if (!he_cap || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; len = 2 + 1 + sizeof(struct ieee80211_he_operation); - if (sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) + if (sdata->vif.bss_conf.chanreq.oper.chan->band == NL80211_BAND_6GHZ) len += sizeof(struct ieee80211_he_6ghz_oper); if (skb_tailroom(skb) < len) return -ENOMEM; pos = skb_put(skb, len); - ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chandef); + ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chanreq.oper); return 0; } @@ -639,37 +629,25 @@ int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, if (!iftd) return 0; - ieee80211_ie_build_he_6ghz_cap(sdata, sdata->deflink.smps_mode, skb); + ieee80211_put_he_6ghz_cap(skb, sdata, sdata->deflink.smps_mode); return 0; } int mesh_add_eht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ie_len) { - const struct ieee80211_sta_he_cap *he_cap; - const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_supported_band *sband; - u8 *pos; sband = ieee80211_get_sband(sdata); if (!sband) return -EINVAL; - he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); - eht_cap = ieee80211_get_eht_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); - if (!he_cap || !eht_cap || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + if (sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; - if (skb_tailroom(skb) < ie_len) - return -ENOMEM; - - pos = skb_put(skb, ie_len); - ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + ie_len, false); - - return 0; + return ieee80211_put_eht_cap(skb, sdata, sband, NULL); } int mesh_add_eht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) @@ -685,9 +663,9 @@ int mesh_add_eht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *sk eht_cap = ieee80211_get_eht_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT); if (!eht_cap || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return 0; len = 2 + 1 + offsetof(struct ieee80211_eht_operation, optional) + @@ -697,7 +675,7 @@ int mesh_add_eht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *sk return -ENOMEM; pos = skb_put(skb, len); - ieee80211_ie_build_eht_oper(pos, &sdata->vif.bss_conf.chandef, eht_cap); + ieee80211_ie_build_eht_oper(pos, &sdata->vif.bss_conf.chanreq.oper, eht_cap); return 0; } @@ -745,9 +723,9 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, return; if (!ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT) || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || - sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10) return; sdata->vif.bss_conf.he_support = true; @@ -972,24 +950,22 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) int head_len, tail_len; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - struct ieee80211_chanctx_conf *chanctx_conf; struct mesh_csa_settings *csa; - enum nl80211_band band; + const struct ieee80211_supported_band *sband; u8 ie_len_he_cap, ie_len_eht_cap; u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); + u32 rate_flags; sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); - band = chanctx_conf->def.chan->band; - rcu_read_unlock(); - ie_len_he_cap = ieee80211_ie_len_he_cap(sdata, - NL80211_IFTYPE_MESH_POINT); - ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata, - NL80211_IFTYPE_MESH_POINT); + sband = ieee80211_get_sband(sdata); + rate_flags = + ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); + + ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); + ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata); head_len = hdr_len + 2 + /* NULL SSID */ /* Channel Switch Announcement */ @@ -1113,7 +1089,9 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) } rcu_read_unlock(); - if (ieee80211_add_srates_ie(sdata, skb, true, band) || + if (ieee80211_put_srates_elem(skb, sband, + sdata->vif.bss_conf.basic_rates, + rate_flags, 0, WLAN_EID_SUPP_RATES) || mesh_add_ds_params_ie(sdata, skb)) goto out_free; @@ -1124,7 +1102,9 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) skb_trim(skb, 0); bcn->tail = bcn->head + bcn->head_len; - if (ieee80211_add_ext_srates_ie(sdata, skb, true, band) || + if (ieee80211_put_srates_elem(skb, sband, + sdata->vif.bss_conf.basic_rates, + rate_flags, 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_ht_cap_ie(sdata, skb) || mesh_add_ht_oper_ie(sdata, skb) || @@ -1240,7 +1220,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) netif_carrier_off(sdata->dev); /* flush STAs and mpaths on this iface */ - sta_info_flush(sdata); + sta_info_flush(sdata, -1); ieee80211_free_keys(sdata, true); mesh_path_flush_by_iface(sdata); @@ -1282,11 +1262,12 @@ static void ieee80211_mesh_csa_mark_radar(struct ieee80211_sub_if_data *sdata) * unavailable. */ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy, - &sdata->vif.bss_conf.chandef, + &sdata->vif.bss_conf.chanreq.oper, NL80211_IFTYPE_MESH_POINT); if (err > 0) cfg80211_radar_event(sdata->local->hw.wiphy, - &sdata->vif.bss_conf.chandef, GFP_ATOMIC); + &sdata->vif.bss_conf.chanreq.oper, + GFP_ATOMIC); } static bool @@ -1298,7 +1279,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_supported_band *sband; int err; - ieee80211_conn_flags_t conn_flags = 0; + struct ieee80211_conn_settings conn = ieee80211_conn_settings_unlimited; u32 vht_cap_info = 0; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -1307,15 +1288,18 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, if (!sband) return false; - switch (sdata->vif.bss_conf.chandef.width) { + switch (sdata->vif.bss_conf.chanreq.oper.width) { case NL80211_CHAN_WIDTH_20_NOHT: - conn_flags |= IEEE80211_CONN_DISABLE_HT; - fallthrough; + conn.mode = IEEE80211_CONN_MODE_LEGACY; + conn.bw_limit = IEEE80211_CONN_BW_LIMIT_20; + break; case NL80211_CHAN_WIDTH_20: - conn_flags |= IEEE80211_CONN_DISABLE_40MHZ; - fallthrough; + conn.mode = IEEE80211_CONN_MODE_HT; + conn.bw_limit = IEEE80211_CONN_BW_LIMIT_20; + break; case NL80211_CHAN_WIDTH_40: - conn_flags |= IEEE80211_CONN_DISABLE_VHT; + conn.mode = IEEE80211_CONN_MODE_HT; + conn.bw_limit = IEEE80211_CONN_BW_LIMIT_40; break; default: break; @@ -1327,8 +1311,8 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, sband->band, - vht_cap_info, - conn_flags, sdata->vif.addr, + vht_cap_info, &conn, + sdata->vif.addr, &csa_ie); if (err < 0) return false; @@ -1341,7 +1325,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, if (csa_ie.reason_code == WLAN_REASON_MESH_CHAN_REGULATORY) ieee80211_mesh_csa_mark_radar(sdata); - params.chandef = csa_ie.chandef; + params.chandef = csa_ie.chanreq.oper; params.count = csa_ie.count; if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, ¶ms.chandef, @@ -1377,7 +1361,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, params.radar_required = err; if (cfg80211_chandef_identical(¶ms.chandef, - &sdata->vif.bss_conf.chandef)) { + &sdata->vif.bss_conf.chanreq.oper)) { mcsa_dbg(sdata, "received csa with an identical chandef, ignoring\n"); return true; @@ -1557,7 +1541,7 @@ int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed) *changed |= BSS_CHANGED_BEACON; mcsa_dbg(sdata, "complete switching to center freq %d MHz", - sdata->vif.bss_conf.chandef.chan->center_freq); + sdata->vif.bss_conf.chanreq.oper.chan->center_freq); return 0; } @@ -1792,6 +1776,7 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) ifmsh->last_preq = jiffies; ifmsh->next_perr = jiffies; ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE; + ifmsh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; /* Allocate all mesh structures when creating the first mesh interface. */ if (!mesh_allocated) ieee80211s_init(); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 58c619874c..3f9664e4e0 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -94,6 +94,7 @@ enum mesh_deferred_task_flags { * @is_root: the destination station of this path is a root node * @is_gate: the destination station of this path is a mesh gate * @path_change_count: the number of path changes to destination + * @fast_tx_check: timestamp of last fast-xmit enable attempt * * * The dst address is unique in the mesh path table. Since the mesh_path is diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index a6b62169f0..c0a5c75cdd 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -1017,10 +1017,23 @@ void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, */ void mesh_path_flush_pending(struct mesh_path *mpath) { + struct ieee80211_sub_if_data *sdata = mpath->sdata; + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct mesh_preq_queue *preq, *tmp; struct sk_buff *skb; while ((skb = skb_dequeue(&mpath->frame_queue)) != NULL) mesh_path_discard_frame(mpath->sdata, skb); + + spin_lock_bh(&ifmsh->mesh_preq_queue_lock); + list_for_each_entry_safe(preq, tmp, &ifmsh->preq_queue.list, list) { + if (ether_addr_equal(mpath->dst, preq->dst)) { + list_del(&preq->list); + kfree(preq); + --ifmsh->preq_queue_len; + } + } + spin_unlock_bh(&ifmsh->mesh_preq_queue_lock); } /** diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 28bf794f67..8f2b492a9f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2023 Intel Corporation + * Copyright (C) 2019, 2021-2024 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ #include <linux/gfp.h> @@ -163,7 +163,7 @@ static u64 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) u16 ht_opmode; bool non_ht_sta = false, ht20_sta = false; - switch (sdata->vif.bss_conf.chandef.width) { + switch (sdata->vif.bss_conf.chanreq.oper.width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: @@ -196,7 +196,7 @@ static u64 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) if (non_ht_sta) ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED; else if (ht20_sta && - sdata->vif.bss_conf.chandef.width > NL80211_CHAN_WIDTH_20) + sdata->vif.bss_conf.chanreq.oper.width > NL80211_CHAN_WIDTH_20) ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_20MHZ; else ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONE; @@ -226,10 +226,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot); int err = -ENOMEM; - ie_len_he_cap = ieee80211_ie_len_he_cap(sdata, - NL80211_IFTYPE_MESH_POINT); - ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata, - NL80211_IFTYPE_MESH_POINT); + ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); + ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata); skb = dev_alloc_skb(local->tx_headroom + hdr_len + 2 + /* capability info */ @@ -266,14 +264,13 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action != WLAN_SP_MESH_PEERING_CLOSE) { struct ieee80211_supported_band *sband; - enum nl80211_band band; + u32 rate_flags, basic_rates; sband = ieee80211_get_sband(sdata); if (!sband) { err = -EINVAL; goto free; } - band = sband->band; /* capability info */ pos = skb_put_zero(skb, 2); @@ -282,8 +279,17 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, 2); put_unaligned_le16(sta->sta.aid, pos); } - if (ieee80211_add_srates_ie(sdata, skb, true, band) || - ieee80211_add_ext_srates_ie(sdata, skb, true, band) || + + rate_flags = + ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); + basic_rates = sdata->vif.bss_conf.basic_rates; + + if (ieee80211_put_srates_elem(skb, sband, basic_rates, + rate_flags, 0, + WLAN_EID_SUPP_RATES) || + ieee80211_put_srates_elem(skb, sband, basic_rates, + rate_flags, 0, + WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_meshid_ie(sdata, skb) || mesh_add_meshconf_ie(sdata, skb)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 94028b541b..ad2ce9c92b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -46,6 +46,8 @@ #define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 +#define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) + static int max_nullfunc_tries = 2; module_param(max_nullfunc_tries, int, 0644); MODULE_PARM_DESC(max_nullfunc_tries, @@ -92,84 +94,6 @@ MODULE_PARM_DESC(probe_wait_ms, #define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 /* - * Extract from the given disabled subchannel bitmap (raw format - * from the EHT Operation Element) the bits for the subchannel - * we're using right now. - */ -static u16 -ieee80211_extract_dis_subch_bmap(const struct ieee80211_eht_operation *eht_oper, - struct cfg80211_chan_def *chandef, u16 bitmap) -{ - struct ieee80211_eht_operation_info *info = (void *)eht_oper->optional; - struct cfg80211_chan_def ap_chandef = *chandef; - u32 ap_center_freq, local_center_freq; - u32 ap_bw, local_bw; - int ap_start_freq, local_start_freq; - u16 shift, mask; - - if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) || - !(eht_oper->params & - IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) - return 0; - - /* set 160/320 supported to get the full AP definition */ - ieee80211_chandef_eht_oper((const void *)eht_oper->optional, - true, true, &ap_chandef); - ap_center_freq = ap_chandef.center_freq1; - ap_bw = 20 * BIT(u8_get_bits(info->control, - IEEE80211_EHT_OPER_CHAN_WIDTH)); - ap_start_freq = ap_center_freq - ap_bw / 2; - local_center_freq = chandef->center_freq1; - local_bw = 20 * BIT(ieee80211_chan_width_to_rx_bw(chandef->width)); - local_start_freq = local_center_freq - local_bw / 2; - shift = (local_start_freq - ap_start_freq) / 20; - mask = BIT(local_bw / 20) - 1; - - return (bitmap >> shift) & mask; -} - -/* - * Handle the puncturing bitmap, possibly downgrading bandwidth to get a - * valid bitmap. - */ -static void -ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link, - const struct ieee80211_eht_operation *eht_oper, - u16 bitmap, u64 *changed) -{ - struct cfg80211_chan_def *chandef = &link->conf->chandef; - struct ieee80211_local *local = link->sdata->local; - u16 extracted; - u64 _changed = 0; - - if (!changed) - changed = &_changed; - - while (chandef->width > NL80211_CHAN_WIDTH_40) { - extracted = - ieee80211_extract_dis_subch_bmap(eht_oper, chandef, - bitmap); - - if (cfg80211_valid_disable_subchannel_bitmap(&bitmap, - chandef) && - !(bitmap && ieee80211_hw_check(&local->hw, - DISALLOW_PUNCTURING))) - break; - link->u.mgd.conn_flags |= - ieee80211_chandef_downgrade(chandef); - *changed |= BSS_CHANGED_BANDWIDTH; - } - - if (chandef->width <= NL80211_CHAN_WIDTH_40) - extracted = 0; - - if (link->conf->eht_puncturing != extracted) { - link->conf->eht_puncturing = extracted; - *changed |= BSS_CHANGED_EHT_PUNCTURING; - } -} - -/* * We can have multiple work items (and connection probing) * scheduling this timer, but we need to take care to only * reschedule it when it should fire _earlier_ than it was @@ -223,77 +147,84 @@ static int ecw2cw(int ecw) return (1 << ecw) - 1; } -static ieee80211_conn_flags_t -ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, - struct ieee80211_link_data *link, - ieee80211_conn_flags_t conn_flags, - struct ieee80211_supported_band *sband, - struct ieee80211_channel *channel, - u32 vht_cap_info, - const struct ieee80211_ht_operation *ht_oper, - const struct ieee80211_vht_operation *vht_oper, - const struct ieee80211_he_operation *he_oper, - const struct ieee80211_eht_operation *eht_oper, - const struct ieee80211_s1g_oper_ie *s1g_oper, - struct cfg80211_chan_def *chandef, bool tracking) +static enum ieee80211_conn_mode +ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + u32 vht_cap_info, + const struct ieee802_11_elems *elems, + bool ignore_ht_channel_mismatch, + const struct ieee80211_conn_settings *conn, + struct cfg80211_chan_def *chandef) { + const struct ieee80211_ht_operation *ht_oper = elems->ht_operation; + const struct ieee80211_vht_operation *vht_oper = elems->vht_operation; + const struct ieee80211_he_operation *he_oper = elems->he_operation; + const struct ieee80211_eht_operation *eht_oper = elems->eht_operation; + struct ieee80211_supported_band *sband = + sdata->local->hw.wiphy->bands[channel->band]; struct cfg80211_chan_def vht_chandef; - struct ieee80211_sta_ht_cap sta_ht_cap; - ieee80211_conn_flags_t ret; + bool no_vht = false; u32 ht_cfreq; - memset(chandef, 0, sizeof(struct cfg80211_chan_def)); - chandef->chan = channel; - chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = channel->center_freq; - chandef->freq1_offset = channel->freq_offset; + *chandef = (struct cfg80211_chan_def) { + .chan = channel, + .width = NL80211_CHAN_WIDTH_20_NOHT, + .center_freq1 = channel->center_freq, + .freq1_offset = channel->freq_offset, + }; - if (channel->band == NL80211_BAND_6GHZ) { - if (!ieee80211_chandef_he_6ghz_oper(sdata, he_oper, eht_oper, - chandef)) { - mlme_dbg(sdata, - "bad 6 GHz operation, disabling HT/VHT/HE/EHT\n"); - ret = IEEE80211_CONN_DISABLE_HT | - IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; - } else { - ret = 0; - } - vht_chandef = *chandef; - goto out; - } else if (sband->band == NL80211_BAND_S1GHZ) { - if (!ieee80211_chandef_s1g_oper(s1g_oper, chandef)) { + /* get special S1G case out of the way */ + if (sband->band == NL80211_BAND_S1GHZ) { + if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { sdata_info(sdata, "Missing S1G Operation Element? Trying operating == primary\n"); chandef->width = ieee80211_s1g_channel_width(channel); } - ret = IEEE80211_CONN_DISABLE_HT | IEEE80211_CONN_DISABLE_40MHZ | - IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_80P80MHZ | - IEEE80211_CONN_DISABLE_160MHZ; - goto out; + return IEEE80211_CONN_MODE_S1G; } - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + /* get special 6 GHz case out of the way */ + if (sband->band == NL80211_BAND_6GHZ) { + enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT; - if (!ht_oper || !sta_ht_cap.ht_supported) { - mlme_dbg(sdata, "HT operation missing / HT not supported\n"); - ret = IEEE80211_CONN_DISABLE_HT | - IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; - goto out; + /* this is an error */ + if (conn->mode < IEEE80211_CONN_MODE_HE) + return IEEE80211_CONN_MODE_LEGACY; + + if (!elems->he_6ghz_capa || !elems->he_cap) { + sdata_info(sdata, + "HE 6 GHz AP is missing HE/HE 6 GHz band capability\n"); + return IEEE80211_CONN_MODE_LEGACY; + } + + if (!eht_oper || !elems->eht_cap) { + eht_oper = NULL; + mode = IEEE80211_CONN_MODE_HE; + } + + if (!ieee80211_chandef_he_6ghz_oper(sdata->local, he_oper, + eht_oper, chandef)) { + sdata_info(sdata, "bad HE/EHT 6 GHz operation\n"); + return IEEE80211_CONN_MODE_LEGACY; + } + + return mode; } + /* now we have the progression HT, VHT, ... */ + if (conn->mode < IEEE80211_CONN_MODE_HT) + return IEEE80211_CONN_MODE_LEGACY; + + if (!ht_oper || !elems->ht_cap_elem) + return IEEE80211_CONN_MODE_LEGACY; + chandef->width = NL80211_CHAN_WIDTH_20; ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ - if (!tracking && channel->center_freq != ht_cfreq) { + if (!ignore_ht_channel_mismatch && channel->center_freq != ht_cfreq) { /* * It's possible that some APs are confused here; * Netgear WNDR3700 sometimes reports 4 higher than @@ -305,36 +236,22 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", channel->center_freq, ht_cfreq, ht_oper->primary_chan, channel->band); - ret = IEEE80211_CONN_DISABLE_HT | - IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; - goto out; + return IEEE80211_CONN_MODE_LEGACY; } - /* check 40 MHz support, if we have it */ - if (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) { - ieee80211_chandef_ht_oper(ht_oper, chandef); - } else { - mlme_dbg(sdata, "40 MHz not supported\n"); - /* 40 MHz (and 80 MHz) must be supported for VHT */ - ret = IEEE80211_CONN_DISABLE_VHT; - /* also mark 40 MHz disabled */ - ret |= IEEE80211_CONN_DISABLE_40MHZ; - goto out; - } + ieee80211_chandef_ht_oper(ht_oper, chandef); - if (!vht_oper || !sband->vht_cap.vht_supported) { - mlme_dbg(sdata, "VHT operation missing / VHT not supported\n"); - ret = IEEE80211_CONN_DISABLE_VHT; - goto out; - } + if (conn->mode < IEEE80211_CONN_MODE_VHT) + return IEEE80211_CONN_MODE_HT; vht_chandef = *chandef; - if (!(conn_flags & IEEE80211_CONN_DISABLE_HE) && - he_oper && - (le32_to_cpu(he_oper->he_oper_params) & - IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { + + /* + * having he_cap/he_oper parsed out implies we're at + * least operating as HE STA + */ + if (elems->he_cap && he_oper && + he_oper->he_oper_params & cpu_to_le32(IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { struct ieee80211_vht_operation he_oper_vht_cap; /* @@ -347,253 +264,625 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { - if (!(conn_flags & IEEE80211_CONN_DISABLE_HE)) - sdata_info(sdata, - "HE AP VHT information is invalid, disabling HE\n"); - ret = IEEE80211_CONN_DISABLE_HE | IEEE80211_CONN_DISABLE_EHT; - goto out; + sdata_info(sdata, + "HE AP VHT information is invalid, disabling HE\n"); + /* this will cause us to re-parse as VHT STA */ + return IEEE80211_CONN_MODE_VHT; } + } else if (!vht_oper || !elems->vht_cap_elem) { + if (sband->band == NL80211_BAND_5GHZ) { + sdata_info(sdata, + "VHT information is missing, disabling VHT\n"); + return IEEE80211_CONN_MODE_HT; + } + no_vht = true; + } else if (sband->band == NL80211_BAND_2GHZ) { + no_vht = true; } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, vht_oper, ht_oper, &vht_chandef)) { - if (!(conn_flags & IEEE80211_CONN_DISABLE_VHT)) - sdata_info(sdata, - "AP VHT information is invalid, disabling VHT\n"); - ret = IEEE80211_CONN_DISABLE_VHT; - goto out; + sdata_info(sdata, + "AP VHT information is invalid, disabling VHT\n"); + return IEEE80211_CONN_MODE_HT; } - if (!cfg80211_chandef_valid(&vht_chandef)) { - if (!(conn_flags & IEEE80211_CONN_DISABLE_VHT)) - sdata_info(sdata, - "AP VHT information is invalid, disabling VHT\n"); - ret = IEEE80211_CONN_DISABLE_VHT; - goto out; + if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { + sdata_info(sdata, + "AP VHT information doesn't match HT, disabling VHT\n"); + return IEEE80211_CONN_MODE_HT; } - if (cfg80211_chandef_identical(chandef, &vht_chandef)) { - ret = 0; - goto out; - } + *chandef = vht_chandef; - if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { - if (!(conn_flags & IEEE80211_CONN_DISABLE_VHT)) - sdata_info(sdata, - "AP VHT information doesn't match HT, disabling VHT\n"); - ret = IEEE80211_CONN_DISABLE_VHT; - goto out; + /* stick to current max mode if we or the AP don't have HE */ + if (conn->mode < IEEE80211_CONN_MODE_HE || + !elems->he_operation || !elems->he_cap) { + if (no_vht) + return IEEE80211_CONN_MODE_HT; + return IEEE80211_CONN_MODE_VHT; } - *chandef = vht_chandef; + /* stick to HE if we or the AP don't have EHT */ + if (conn->mode < IEEE80211_CONN_MODE_EHT || + !eht_oper || !elems->eht_cap) + return IEEE80211_CONN_MODE_HE; /* * handle the case that the EHT operation indicates that it holds EHT * operation information (in case that the channel width differs from * the channel width reported in HT/VHT/HE). */ - if (eht_oper && (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) { + if (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) { struct cfg80211_chan_def eht_chandef = *chandef; ieee80211_chandef_eht_oper((const void *)eht_oper->optional, - eht_chandef.width == - NL80211_CHAN_WIDTH_160, - false, &eht_chandef); + &eht_chandef); + + eht_chandef.punctured = + ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); if (!cfg80211_chandef_valid(&eht_chandef)) { - if (!(conn_flags & IEEE80211_CONN_DISABLE_EHT)) - sdata_info(sdata, - "AP EHT information is invalid, disabling EHT\n"); - ret = IEEE80211_CONN_DISABLE_EHT; - goto out; + sdata_info(sdata, + "AP EHT information is invalid, disabling EHT\n"); + return IEEE80211_CONN_MODE_HE; } if (!cfg80211_chandef_compatible(chandef, &eht_chandef)) { - if (!(conn_flags & IEEE80211_CONN_DISABLE_EHT)) - sdata_info(sdata, - "AP EHT information is incompatible, disabling EHT\n"); - ret = IEEE80211_CONN_DISABLE_EHT; - goto out; + sdata_info(sdata, + "AP EHT information doesn't match HT/VHT/HE, disabling EHT\n"); + return IEEE80211_CONN_MODE_HE; } *chandef = eht_chandef; } - ret = 0; + return IEEE80211_CONN_MODE_EHT; +} + +static bool +ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_cap_elem *he_cap, + const struct ieee80211_he_operation *he_op) +{ + struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; + u16 mcs_80_map_tx, mcs_80_map_rx; + u16 ap_min_req_set; + int nss; + + if (!he_cap) + return false; + + /* mcs_nss is right after he_cap info */ + he_mcs_nss_supp = (void *)(he_cap + 1); + + mcs_80_map_tx = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); + mcs_80_map_rx = le16_to_cpu(he_mcs_nss_supp->rx_mcs_80); + + /* P802.11-REVme/D0.3 + * 27.1.1 Introduction to the HE PHY + * ... + * An HE STA shall support the following features: + * ... + * Single spatial stream HE-MCSs 0 to 7 (transmit and receive) in all + * supported channel widths for HE SU PPDUs + */ + if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED || + (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) { + sdata_info(sdata, + "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", + mcs_80_map_tx, mcs_80_map_rx); + return false; + } + + if (!he_op) + return true; + + ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); -out: /* - * When tracking the current AP, don't do any further checks if the - * new chandef is identical to the one we're currently using for the - * connection. This keeps us from playing ping-pong with regulatory, - * without it the following can happen (for example): - * - connect to an AP with 80 MHz, world regdom allows 80 MHz - * - AP advertises regdom US - * - CRDA loads regdom US with 80 MHz prohibited (old database) - * - the code below detects an unsupported channel, downgrades, and - * we disconnect from the AP in the caller - * - disconnect causes CRDA to reload world regdomain and the game - * starts anew. - * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881) + * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all + * zeroes, which is nonsense, and completely inconsistent with itself + * (it doesn't have 8 streams). Accept the settings in this case anyway. + */ + if (!ap_min_req_set) + return true; + + /* make sure the AP is consistent with itself * - * It seems possible that there are still scenarios with CSA or real - * bandwidth changes where a this could happen, but those cases are - * less common and wouldn't completely prevent using the AP. + * P802.11-REVme/D0.3 + * 26.17.1 Basic HE BSS operation + * + * A STA that is operating in an HE BSS shall be able to receive and + * transmit at each of the <HE-MCS, NSS> tuple values indicated by the + * Basic HE-MCS And NSS Set field of the HE Operation parameter of the + * MLME-START.request primitive and shall be able to receive at each of + * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and + * NSS Set field in the HE Capabilities parameter of the MLMESTART.request + * primitive */ - if (tracking && - cfg80211_chandef_identical(chandef, &link->conf->chandef)) - return ret; + for (nss = 8; nss > 0; nss--) { + u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + u8 ap_rx_val; + u8 ap_tx_val; + + if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + ap_rx_val = (mcs_80_map_rx >> (2 * (nss - 1))) & 3; + ap_tx_val = (mcs_80_map_tx >> (2 * (nss - 1))) & 3; + + if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) { + sdata_info(sdata, + "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", + nss, ap_rx_val, ap_rx_val, ap_op_val); + return false; + } + } + + return true; +} + +static bool +ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_he_operation *he_op) +{ + const struct ieee80211_sta_he_cap *sta_he_cap = + ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); + u16 ap_min_req_set; + int i; - /* don't print the message below for VHT mismatch if VHT is disabled */ - if (ret & IEEE80211_CONN_DISABLE_VHT) - vht_chandef = *chandef; + if (!sta_he_cap || !he_op) + return false; + + ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* - * Ignore the DISABLED flag when we're already connected and only - * tracking the APs beacon for bandwidth changes - otherwise we - * might get disconnected here if we connect to an AP, update our - * regulatory information based on the AP's country IE and the - * information we have is wrong/outdated and disables the channel - * that we're actually using for the connection to the AP. + * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all + * zeroes, which is nonsense, and completely inconsistent with itself + * (it doesn't have 8 streams). Accept the settings in this case anyway. */ - while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, - tracking ? 0 : - IEEE80211_CHAN_DISABLED)) { - if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) { - ret = IEEE80211_CONN_DISABLE_HT | - IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; - break; + if (!ap_min_req_set) + return true; + + /* Need to go over for 80MHz, 160MHz and for 80+80 */ + for (i = 0; i < 3; i++) { + const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = + &sta_he_cap->he_mcs_nss_supp; + u16 sta_mcs_map_rx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); + u16 sta_mcs_map_tx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); + u8 nss; + bool verified = true; + + /* + * For each band there is a maximum of 8 spatial streams + * possible. Each of the sta_mcs_map_* is a 16-bit struct built + * of 2 bits per NSS (1-8), with the values defined in enum + * ieee80211_he_mcs_support. Need to make sure STA TX and RX + * capabilities aren't less than the AP's minimum requirements + * for this HE BSS per SS. + * It is enough to find one such band that meets the reqs. + */ + for (nss = 8; nss > 0; nss--) { + u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; + u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; + u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + + if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + /* + * Make sure the HE AP doesn't require MCSs that aren't + * supported by the client as required by spec + * + * P802.11-REVme/D0.3 + * 26.17.1 Basic HE BSS operation + * + * An HE STA shall not attempt to join * (MLME-JOIN.request primitive) + * a BSS, unless it supports (i.e., is able to both transmit and + * receive using) all of the <HE-MCS, NSS> tuples in the basic + * HE-MCS and NSS set. + */ + if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { + verified = false; + break; + } + } + + if (verified) + return true; + } + + /* If here, STA doesn't meet AP's HE min requirements */ + return false; +} + +static u8 +ieee80211_get_eht_cap_mcs_nss(const struct ieee80211_sta_he_cap *sta_he_cap, + const struct ieee80211_sta_eht_cap *sta_eht_cap, + unsigned int idx, int bw) +{ + u8 he_phy_cap0 = sta_he_cap->he_cap_elem.phy_cap_info[0]; + u8 eht_phy_cap0 = sta_eht_cap->eht_cap_elem.phy_cap_info[0]; + + /* handle us being a 20 MHz-only EHT STA - with four values + * for MCS 0-7, 8-9, 10-11, 12-13. + */ + if (!(he_phy_cap0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) + return sta_eht_cap->eht_mcs_nss_supp.only_20mhz.rx_tx_max_nss[idx]; + + /* the others have MCS 0-9 together, rather than separately from 0-7 */ + if (idx > 0) + idx--; + + switch (bw) { + case 0: + return sta_eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_max_nss[idx]; + case 1: + if (!(he_phy_cap0 & + (IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G))) + return 0xff; /* pass check */ + return sta_eht_cap->eht_mcs_nss_supp.bw._160.rx_tx_max_nss[idx]; + case 2: + if (!(eht_phy_cap0 & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)) + return 0xff; /* pass check */ + return sta_eht_cap->eht_mcs_nss_supp.bw._320.rx_tx_max_nss[idx]; + } + + WARN_ON(1); + return 0; +} + +static bool +ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_eht_operation *eht_op) +{ + const struct ieee80211_sta_he_cap *sta_he_cap = + ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); + const struct ieee80211_sta_eht_cap *sta_eht_cap = + ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *req; + unsigned int i; + + if (!sta_he_cap || !sta_eht_cap || !eht_op) + return false; + + req = &eht_op->basic_mcs_nss; + + for (i = 0; i < ARRAY_SIZE(req->rx_tx_max_nss); i++) { + u8 req_rx_nss, req_tx_nss; + unsigned int bw; + + req_rx_nss = u8_get_bits(req->rx_tx_max_nss[i], + IEEE80211_EHT_MCS_NSS_RX); + req_tx_nss = u8_get_bits(req->rx_tx_max_nss[i], + IEEE80211_EHT_MCS_NSS_TX); + + for (bw = 0; bw < 3; bw++) { + u8 have, have_rx_nss, have_tx_nss; + + have = ieee80211_get_eht_cap_mcs_nss(sta_he_cap, + sta_eht_cap, + i, bw); + have_rx_nss = u8_get_bits(have, + IEEE80211_EHT_MCS_NSS_RX); + have_tx_nss = u8_get_bits(have, + IEEE80211_EHT_MCS_NSS_TX); + + if (req_rx_nss > have_rx_nss || + req_tx_nss > have_tx_nss) + return false; + } + } + + return true; +} + +static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, + const struct cfg80211_chan_def *chandef, + u32 prohibited_flags) +{ + if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, + chandef, prohibited_flags)) + return false; + + if (chandef->punctured && + ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) + return false; + + if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ && + ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ)) + return false; + + return true; +} + +static struct ieee802_11_elems * +ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, + struct ieee80211_conn_settings *conn, + struct cfg80211_bss *cbss, int link_id, + struct ieee80211_chan_req *chanreq) +{ + const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies); + struct ieee80211_bss *bss = (void *)cbss->priv; + struct ieee80211_channel *channel = cbss->channel; + struct ieee80211_elems_parse_params parse_params = { + .link_id = -1, + .from_ap = true, + .start = ies->data, + .len = ies->len, + }; + struct ieee802_11_elems *elems; + struct ieee80211_supported_band *sband; + struct cfg80211_chan_def ap_chandef; + enum ieee80211_conn_mode ap_mode; + int ret; + +again: + parse_params.mode = conn->mode; + elems = ieee802_11_parse_elems_full(&parse_params); + if (!elems) + return ERR_PTR(-ENOMEM); + + ap_mode = ieee80211_determine_ap_chan(sdata, channel, bss->vht_cap_info, + elems, false, conn, &ap_chandef); + + /* this should be impossible since parsing depends on our mode */ + if (WARN_ON(ap_mode > conn->mode)) { + ret = -EINVAL; + goto free; + } + + if (conn->mode != ap_mode) { + conn->mode = ap_mode; + kfree(elems); + goto again; + } + + mlme_link_id_dbg(sdata, link_id, "determined AP %pM to be %s\n", + cbss->bssid, ieee80211_conn_mode_str(ap_mode)); + + sband = sdata->local->hw.wiphy->bands[channel->band]; + + switch (channel->band) { + case NL80211_BAND_S1GHZ: + if (WARN_ON(ap_mode != IEEE80211_CONN_MODE_S1G)) { + ret = -EINVAL; + goto free; } + return elems; + case NL80211_BAND_6GHZ: + if (ap_mode < IEEE80211_CONN_MODE_HE) { + sdata_info(sdata, + "Rejecting non-HE 6/7 GHz connection"); + ret = -EINVAL; + goto free; + } + break; + default: + if (WARN_ON(ap_mode == IEEE80211_CONN_MODE_S1G)) { + ret = -EINVAL; + goto free; + } + } - ret |= ieee80211_chandef_downgrade(chandef); + switch (ap_mode) { + case IEEE80211_CONN_MODE_S1G: + WARN_ON(1); + ret = -EINVAL; + goto free; + case IEEE80211_CONN_MODE_LEGACY: + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + break; + case IEEE80211_CONN_MODE_HT: + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_40); + break; + case IEEE80211_CONN_MODE_VHT: + case IEEE80211_CONN_MODE_HE: + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_160); + break; + case IEEE80211_CONN_MODE_EHT: + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_320); + break; } - if (!he_oper || !cfg80211_chandef_usable(sdata->wdev.wiphy, chandef, - IEEE80211_CHAN_NO_HE)) - ret |= IEEE80211_CONN_DISABLE_HE | IEEE80211_CONN_DISABLE_EHT; + chanreq->oper = ap_chandef; - if (!eht_oper || !cfg80211_chandef_usable(sdata->wdev.wiphy, chandef, - IEEE80211_CHAN_NO_EHT)) - ret |= IEEE80211_CONN_DISABLE_EHT; + /* wider-bandwidth OFDMA is only done in EHT */ + if (conn->mode >= IEEE80211_CONN_MODE_EHT && + !(sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW)) + chanreq->ap = ap_chandef; + else + chanreq->ap.chan = NULL; - if (chandef->width != vht_chandef.width && !tracking) + while (!ieee80211_chandef_usable(sdata, &chanreq->oper, + IEEE80211_CHAN_DISABLED)) { + if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) { + ret = -EINVAL; + goto free; + } + + ieee80211_chanreq_downgrade(chanreq, conn); + } + + if (conn->mode >= IEEE80211_CONN_MODE_HE && + !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_NO_HE)) { + conn->mode = IEEE80211_CONN_MODE_VHT; + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_160); + } + + if (conn->mode >= IEEE80211_CONN_MODE_EHT && + !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_NO_EHT)) { + conn->mode = IEEE80211_CONN_MODE_HE; + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_160); + } + + if (chanreq->oper.width != ap_chandef.width || ap_mode != conn->mode) sdata_info(sdata, - "capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n"); + "regulatory prevented using AP config, downgraded\n"); - WARN_ON_ONCE(!cfg80211_chandef_valid(chandef)); - return ret; + if (conn->mode >= IEEE80211_CONN_MODE_HE && + (!ieee80211_verify_peer_he_mcs_support(sdata, (void *)elems->he_cap, + elems->he_operation) || + !ieee80211_verify_sta_he_mcs_support(sdata, sband, + elems->he_operation))) { + conn->mode = IEEE80211_CONN_MODE_VHT; + sdata_info(sdata, "required MCSes not supported, disabling HE\n"); + } + + if (conn->mode >= IEEE80211_CONN_MODE_EHT && + !ieee80211_verify_sta_eht_mcs_support(sdata, sband, + elems->eht_operation)) { + conn->mode = IEEE80211_CONN_MODE_HE; + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_160); + sdata_info(sdata, "required MCSes not supported, disabling EHT\n"); + } + + /* the mode can only decrease, so this must terminate */ + if (ap_mode != conn->mode) { + kfree(elems); + goto again; + } + + mlme_link_id_dbg(sdata, link_id, + "connecting with %s mode, max bandwidth %d MHz\n", + ieee80211_conn_mode_str(conn->mode), + 20 * (1 << conn->bw_limit)); + + if (WARN_ON_ONCE(!cfg80211_chandef_valid(&chanreq->oper))) { + ret = -EINVAL; + goto free; + } + + return elems; +free: + kfree(elems); + return ERR_PTR(ret); } static int ieee80211_config_bw(struct ieee80211_link_data *link, - const struct ieee80211_ht_cap *ht_cap, - const struct ieee80211_vht_cap *vht_cap, - const struct ieee80211_ht_operation *ht_oper, - const struct ieee80211_vht_operation *vht_oper, - const struct ieee80211_he_operation *he_oper, - const struct ieee80211_eht_operation *eht_oper, - const struct ieee80211_s1g_oper_ie *s1g_oper, - const u8 *bssid, u64 *changed) + struct ieee802_11_elems *elems, + bool update, u64 *changed) { + struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; - struct ieee80211_local *local = sdata->local; - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_channel *chan = link->conf->chandef.chan; - struct ieee80211_supported_band *sband = - local->hw.wiphy->bands[chan->band]; - struct cfg80211_chan_def chandef; - u16 ht_opmode; - ieee80211_conn_flags_t flags; + struct ieee80211_chan_req chanreq = {}; + enum ieee80211_conn_mode ap_mode; u32 vht_cap_info = 0; + u16 ht_opmode; int ret; - /* if HT was/is disabled, don't track any bandwidth changes */ - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT || !ht_oper) + /* don't track any bandwidth changes in legacy/S1G modes */ + if (link->u.mgd.conn.mode == IEEE80211_CONN_MODE_LEGACY || + link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G) return 0; - /* don't check VHT if we associated as non-VHT station */ - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT) - vht_oper = NULL; + if (elems->vht_cap_elem) + vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + + ap_mode = ieee80211_determine_ap_chan(sdata, channel, vht_cap_info, + elems, true, &link->u.mgd.conn, + &chanreq.ap); - /* don't check HE if we associated as non-HE station */ - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE || - !ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif)) { - he_oper = NULL; - eht_oper = NULL; + if (ap_mode != link->u.mgd.conn.mode) { + link_info(link, + "AP appears to change mode (expected %s, found %s), disconnect\n", + ieee80211_conn_mode_str(link->u.mgd.conn.mode), + ieee80211_conn_mode_str(ap_mode)); + return -EINVAL; } - /* don't check EHT if we associated as non-EHT station */ - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT || - !ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif)) - eht_oper = NULL; + chanreq.oper = chanreq.ap; + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT || + sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW) + chanreq.ap.chan = NULL; /* - * if bss configuration changed store the new one - + * if HT operation mode changed store the new one - * this may be applicable even if channel is identical */ - ht_opmode = le16_to_cpu(ht_oper->operation_mode); - if (link->conf->ht_operation_mode != ht_opmode) { - *changed |= BSS_CHANGED_HT; - link->conf->ht_operation_mode = ht_opmode; + if (elems->ht_operation) { + ht_opmode = le16_to_cpu(elems->ht_operation->operation_mode); + if (link->conf->ht_operation_mode != ht_opmode) { + *changed |= BSS_CHANGED_HT; + link->conf->ht_operation_mode = ht_opmode; + } } - if (vht_cap) - vht_cap_info = le32_to_cpu(vht_cap->vht_cap_info); - - /* calculate new channel (type) based on HT/VHT/HE operation IEs */ - flags = ieee80211_determine_chantype(sdata, link, - link->u.mgd.conn_flags, - sband, chan, vht_cap_info, - ht_oper, vht_oper, - he_oper, eht_oper, - s1g_oper, &chandef, true); - /* * Downgrade the new channel if we associated with restricted - * capabilities. For example, if we associated as a 20 MHz STA - * to a 40 MHz AP (due to regulatory, capabilities or config - * reasons) then switching to a 40 MHz channel now won't do us - * any good -- we couldn't use it with the AP. + * bandwidth capabilities. For example, if we associated as a + * 20 MHz STA to a 40 MHz AP (due to regulatory, capabilities + * or config reasons) then switching to a 40 MHz channel now + * won't do us any good -- we couldn't use it with the AP. */ - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_80P80MHZ && - chandef.width == NL80211_CHAN_WIDTH_80P80) - flags |= ieee80211_chandef_downgrade(&chandef); - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_160MHZ && - chandef.width == NL80211_CHAN_WIDTH_160) - flags |= ieee80211_chandef_downgrade(&chandef); - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_40MHZ && - chandef.width > NL80211_CHAN_WIDTH_20) - flags |= ieee80211_chandef_downgrade(&chandef); - - if (cfg80211_chandef_identical(&chandef, &link->conf->chandef)) + while (link->u.mgd.conn.bw_limit < + ieee80211_min_bw_limit_from_chandef(&chanreq.oper)) + ieee80211_chandef_downgrade(&chanreq.oper, NULL); + + if (ieee80211_chanreq_identical(&chanreq, &link->conf->chanreq)) return 0; link_info(link, - "AP %pM changed bandwidth, new config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", - link->u.mgd.bssid, chandef.chan->center_freq, - chandef.chan->freq_offset, chandef.width, - chandef.center_freq1, chandef.freq1_offset, - chandef.center_freq2); - - if (flags != (link->u.mgd.conn_flags & - (IEEE80211_CONN_DISABLE_HT | - IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT | - IEEE80211_CONN_DISABLE_40MHZ | - IEEE80211_CONN_DISABLE_80P80MHZ | - IEEE80211_CONN_DISABLE_160MHZ | - IEEE80211_CONN_DISABLE_320MHZ)) || - !cfg80211_chandef_valid(&chandef)) { + "AP %pM changed bandwidth, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", + link->u.mgd.bssid, chanreq.oper.chan->center_freq, + chanreq.oper.chan->freq_offset, chanreq.oper.width, + chanreq.oper.center_freq1, chanreq.oper.freq1_offset, + chanreq.oper.center_freq2); + + if (!cfg80211_chandef_valid(&chanreq.oper)) { sdata_info(sdata, - "AP %pM changed caps/bw in a way we can't support (0x%x/0x%x) - disconnect\n", - link->u.mgd.bssid, flags, ifmgd->flags); + "AP %pM changed caps/bw in a way we can't support - disconnect\n", + link->u.mgd.bssid); return -EINVAL; } - ret = ieee80211_link_change_bandwidth(link, &chandef, changed); + if (!update) { + link->conf->chanreq = chanreq; + return 0; + } + /* + * We're tracking the current AP here, so don't do any further checks + * here. This keeps us from playing ping-pong with regulatory, without + * it the following can happen (for example): + * - connect to an AP with 80 MHz, world regdom allows 80 MHz + * - AP advertises regdom US + * - CRDA loads regdom US with 80 MHz prohibited (old database) + * - we detect an unsupported channel and disconnect + * - disconnect causes CRDA to reload world regdomain and the game + * starts anew. + * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881) + * + * It seems possible that there are still scenarios with CSA or real + * bandwidth changes where a this could happen, but those cases are + * less common and wouldn't completely prevent using the AP. + */ + + ret = ieee80211_link_change_chanreq(link, &chanreq, changed); if (ret) { sdata_info(sdata, "AP %pM changed bandwidth to incompatible one - disconnect\n", @@ -612,7 +901,7 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, enum ieee80211_smps_mode smps, - ieee80211_conn_flags_t conn_flags) + const struct ieee80211_conn_settings *conn) { u8 *pos; u32 flags = channel->flags; @@ -647,7 +936,7 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, * capable of 40 MHz -- some broken APs will never fall * back to trying to transmit in 20 MHz. */ - if (conn_flags & IEEE80211_CONN_DISABLE_40MHZ) { + if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_20) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } @@ -686,7 +975,7 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_supported_band *sband, struct ieee80211_vht_cap *ap_vht_cap, - ieee80211_conn_flags_t conn_flags) + const struct ieee80211_conn_settings *conn) { struct ieee80211_local *local = sdata->local; u8 *pos; @@ -703,16 +992,7 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, /* determine capability flags */ cap = vht_cap.cap; - if (conn_flags & IEEE80211_CONN_DISABLE_80P80MHZ) { - u32 bw = cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - - cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - if (bw == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ || - bw == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) - cap |= IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; - } - - if (conn_flags & IEEE80211_CONN_DISABLE_160MHZ) { + if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_80) { cap &= ~IEEE80211_VHT_CAP_SHORT_GI_160; cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; } @@ -769,79 +1049,12 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, return mu_mimo_owner; } -/* This function determines HE capability flags for the association - * and builds the IE. - */ -static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, - struct ieee80211_supported_band *sband, - enum ieee80211_smps_mode smps_mode, - ieee80211_conn_flags_t conn_flags) -{ - u8 *pos, *pre_he_pos; - const struct ieee80211_sta_he_cap *he_cap; - u8 he_cap_size; - - he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); - if (WARN_ON(!he_cap)) - return; - - /* get a max size estimate */ - he_cap_size = - 2 + 1 + sizeof(he_cap->he_cap_elem) + - ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) + - ieee80211_he_ppe_size(he_cap->ppe_thres[0], - he_cap->he_cap_elem.phy_cap_info); - pos = skb_put(skb, he_cap_size); - pre_he_pos = pos; - pos = ieee80211_ie_build_he_cap(conn_flags, - pos, he_cap, pos + he_cap_size); - /* trim excess if any */ - skb_trim(skb, skb->len - (pre_he_pos + he_cap_size - pos)); - - ieee80211_ie_build_he_6ghz_cap(sdata, smps_mode, skb); -} - -static void ieee80211_add_eht_ie(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, - struct ieee80211_supported_band *sband) -{ - u8 *pos; - const struct ieee80211_sta_he_cap *he_cap; - const struct ieee80211_sta_eht_cap *eht_cap; - u8 eht_cap_size; - - he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); - eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); - - /* - * EHT capabilities element is only added if the HE capabilities element - * was added so assume that 'he_cap' is valid and don't check it. - */ - if (WARN_ON(!he_cap || !eht_cap)) - return; - - eht_cap_size = - 2 + 1 + sizeof(eht_cap->eht_cap_elem) + - ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, - &eht_cap->eht_cap_elem, - false) + - ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], - eht_cap->eht_cap_elem.phy_cap_info); - pos = skb_put(skb, eht_cap_size); - ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + eht_cap_size, - false); -} - static void ieee80211_assoc_add_rates(struct sk_buff *skb, enum nl80211_chan_width width, struct ieee80211_supported_band *sband, struct ieee80211_mgd_assoc_data *assoc_data) { - unsigned int rates_len, supp_rates_len; - u32 rates = 0; - int i, count; - u8 *pos; + u32 rates; if (assoc_data->supp_rates_len) { /* @@ -850,53 +1063,23 @@ static void ieee80211_assoc_add_rates(struct sk_buff *skb, * in the association request (e.g. D-Link DAP 1353 in * b-only mode)... */ - rates_len = ieee80211_parse_bitrates(width, sband, - assoc_data->supp_rates, - assoc_data->supp_rates_len, - &rates); + ieee80211_parse_bitrates(width, sband, + assoc_data->supp_rates, + assoc_data->supp_rates_len, + &rates); } else { /* * In case AP not provide any supported rates information * before association, we send information element(s) with * all rates that we support. */ - rates_len = sband->n_bitrates; - for (i = 0; i < sband->n_bitrates; i++) - rates |= BIT(i); - } - - supp_rates_len = rates_len; - if (supp_rates_len > 8) - supp_rates_len = 8; - - pos = skb_put(skb, supp_rates_len + 2); - *pos++ = WLAN_EID_SUPP_RATES; - *pos++ = supp_rates_len; - - count = 0; - for (i = 0; i < sband->n_bitrates; i++) { - if (BIT(i) & rates) { - int rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); - *pos++ = (u8)rate; - if (++count == 8) - break; - } + rates = ~0; } - if (rates_len > count) { - pos = skb_put(skb, rates_len - count + 2); - *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = rates_len - count; - - for (i++; i < sband->n_bitrates; i++) { - if (BIT(i) & rates) { - int rate; - - rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); - *pos++ = (u8)rate; - } - } - } + ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + WLAN_EID_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + WLAN_EID_EXT_SUPP_RATES); } static size_t ieee80211_add_before_ht_elems(struct sk_buff *skb, @@ -1133,11 +1316,11 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, offset); if (sband->band != NL80211_BAND_6GHZ && - !(assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_HT)) { + assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HT) { ieee80211_add_ht_ie(sdata, skb, assoc_data->link[link_id].ap_ht_param, sband, chan, smps_mode, - assoc_data->link[link_id].conn_flags); + &assoc_data->link[link_id].conn); ADD_PRESENT_ELEM(WLAN_EID_HT_CAPABILITY); } @@ -1147,37 +1330,28 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, offset); if (sband->band != NL80211_BAND_6GHZ && - !(assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_VHT)) { + assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_VHT && + sband->vht_cap.vht_supported) { bool mu_mimo_owner = ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->link[link_id].ap_vht_cap, - assoc_data->link[link_id].conn_flags); + &assoc_data->link[link_id].conn); if (link) link->conf->mu_mimo_owner = mu_mimo_owner; ADD_PRESENT_ELEM(WLAN_EID_VHT_CAPABILITY); } - /* - * If AP doesn't support HT, mark HE and EHT as disabled. - * If on the 5GHz band, make sure it supports VHT. - */ - if (assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_HT || - (sband->band == NL80211_BAND_5GHZ && - assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_VHT)) - assoc_data->link[link_id].conn_flags |= - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; - /* if present, add any custom IEs that go before HE */ offset = ieee80211_add_before_he_elems(skb, extra_elems, extra_elems_len, offset); - if (!(assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_HE)) { - ieee80211_add_he_ie(sdata, skb, sband, smps_mode, - assoc_data->link[link_id].conn_flags); + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HE) { + ieee80211_put_he_cap(skb, sdata, sband, + &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); + ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* @@ -1185,7 +1359,7 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, * calling ieee80211_assoc_add_ml_elem(), so add this one if * we're going to put it after the ML element */ - if (!(assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_EHT)) + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY); if (link_id == assoc_data->assoc_link_id) @@ -1195,8 +1369,9 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, /* crash if somebody gets it wrong */ present_elems = NULL; - if (!(assoc_data->link[link_id].conn_flags & IEEE80211_CONN_DISABLE_EHT)) - ieee80211_add_eht_ie(sdata, skb, sband); + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) + ieee80211_put_eht_cap(skb, sdata, sband, + &assoc_data->link[link_id].conn); if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_add_aid_request_ie(sdata, skb); @@ -1206,9 +1381,6 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, if (iftd && iftd->vendor_elems.data && iftd->vendor_elems.len) skb_put_data(skb, iftd->vendor_elems.data, iftd->vendor_elems.len); - if (link) - link->u.mgd.conn_flags = assoc_data->link[link_id].conn_flags; - return offset; } @@ -1318,8 +1490,6 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EML_CAPA); skb_put_data(skb, &eml_capa, sizeof(eml_capa)); } - /* need indication from userspace to support this */ - mld_capa_ops &= ~cpu_to_le16(IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP); skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { @@ -1499,7 +1669,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) /* Set MBSSID support for HE AP if needed */ if (ieee80211_hw_check(&local->hw, SUPPORTS_ONLY_HE_MULTI_BSSID) && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && ext_capa && ext_capa->datalen >= 3) ext_capa->data[2] |= WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT; @@ -1544,7 +1714,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) * for some reason check it and want it to be set, set the bit for all * pre-EHT connections as we used to do. */ - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT) + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT) capab |= WLAN_CAPABILITY_ESS; /* add the elements for the assoc (main) link */ @@ -1741,8 +1911,8 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy, return; } - if (!cfg80211_chandef_identical(&link->conf->chandef, - &link->csa_chandef)) { + if (!ieee80211_chanreq_identical(&link->conf->chanreq, + &link->csa_chanreq)) { sdata_info(sdata, "failed to finalize channel switch, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, @@ -1767,19 +1937,15 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) WARN_ON(!link->conf->csa_active); - if (link->csa_block_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - link->csa_block_tx = false; + sdata->csa_blocked_queues = false; } link->conf->csa_active = false; + link->u.mgd.csa_blocked_tx = false; link->u.mgd.csa_waiting_bcn = false; - /* - * If the CSA IE is still present on the beacon after the switch, - * we need to consider it as a new CSA (possibly to self). - */ - link->u.mgd.beacon_crc_valid = false; ret = drv_post_channel_switch(link); if (ret) { @@ -1790,8 +1956,8 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) return; } - cfg80211_ch_switch_notify(sdata->dev, &link->reserved_chandef, - link->link_id, 0); + cfg80211_ch_switch_notify(sdata->dev, &link->reserved.oper, + link->link_id); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, @@ -1838,14 +2004,16 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) ieee80211_link_unreserve_chanctx(link); - if (link->csa_block_tx) + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_blocked_queues = false; + } - link->csa_block_tx = false; link->conf->csa_active = false; + link->u.mgd.csa_blocked_tx = false; - drv_abort_channel_switch(sdata); + drv_abort_channel_switch(link); } static void @@ -1857,12 +2025,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct cfg80211_bss *cbss = link->u.mgd.bss; + struct cfg80211_bss *cbss = link->conf->bss; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; - struct ieee80211_channel_switch ch_switch; + struct ieee80211_channel_switch ch_switch = { + .link_id = link->link_id, + }; struct ieee80211_bss *bss; unsigned long timeout; int res; @@ -1876,14 +2046,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, bss->vht_cap_info, - link->u.mgd.conn_flags, + &link->u.mgd.conn, link->u.mgd.bssid, &csa_ie); if (!res) { ch_switch.timestamp = timestamp; ch_switch.device_timestamp = device_timestamp; ch_switch.block_tx = csa_ie.mode; - ch_switch.chandef = csa_ie.chandef; + ch_switch.chandef = csa_ie.chanreq.oper; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; } @@ -1891,46 +2061,62 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, if (res < 0) goto drop_connection; - if (beacon && link->conf->csa_active && - !link->u.mgd.csa_waiting_bcn) { - if (res) + if (link->conf->csa_active) { + /* already processing - disregard action frames */ + if (!beacon) + return; + + if (link->u.mgd.csa_waiting_bcn) { + ieee80211_chswitch_post_beacon(link); + /* + * If the CSA IE is still present in the beacon after + * the switch, we need to consider it as a new CSA + * (possibly to self) - this happens by not returning + * here so we'll get to the check below. + */ + } else if (res) { ieee80211_sta_abort_chanswitch(link); - else + return; + } else { drv_channel_switch_rx_beacon(sdata, &ch_switch); - return; - } else if (link->conf->csa_active || res) { - /* disregard subsequent announcements if already processing */ - return; + return; + } } - if (link->conf->chandef.chan->band != - csa_ie.chandef.chan->band) { + /* nothing to do at all - no active CSA nor a new one */ + if (res) + return; + + if (link->conf->chanreq.oper.chan->band != + csa_ie.chanreq.oper.chan->band) { sdata_info(sdata, "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", link->u.mgd.bssid, - csa_ie.chandef.chan->center_freq, - csa_ie.chandef.width, csa_ie.chandef.center_freq1, - csa_ie.chandef.center_freq2); + csa_ie.chanreq.oper.chan->center_freq, + csa_ie.chanreq.oper.width, + csa_ie.chanreq.oper.center_freq1, + csa_ie.chanreq.oper.center_freq2); goto drop_connection; } - if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef, + if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chanreq.oper, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, "AP %pM switches to unsupported channel " "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), " "disconnecting\n", link->u.mgd.bssid, - csa_ie.chandef.chan->center_freq, - csa_ie.chandef.chan->freq_offset, - csa_ie.chandef.width, csa_ie.chandef.center_freq1, - csa_ie.chandef.freq1_offset, - csa_ie.chandef.center_freq2); + csa_ie.chanreq.oper.chan->center_freq, + csa_ie.chanreq.oper.chan->freq_offset, + csa_ie.chanreq.oper.width, + csa_ie.chanreq.oper.center_freq1, + csa_ie.chanreq.oper.freq1_offset, + csa_ie.chanreq.oper.center_freq2); goto drop_connection; } - if (cfg80211_chandef_identical(&csa_ie.chandef, - &link->conf->chandef) && + if (cfg80211_chandef_identical(&csa_ie.chanreq.oper, + &link->conf->chanreq.oper) && (!csa_ie.mode || !beacon)) { if (link->u.mgd.csa_ignored_same_chan) return; @@ -1942,12 +2128,13 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, } /* - * Drop all TDLS peers - either we disconnect or move to a different - * channel from this point on. There's no telling what our peer will do. + * Drop all TDLS peers on the affected link - either we disconnect or + * move to a different channel from this point on. There's no telling + * what our peer will do. * The TDLS WIDER_BW scenario is also problematic, as peers might now * have an incompatible wider chandef. */ - ieee80211_teardown_tdls_peers(sdata); + ieee80211_teardown_tdls_peers(link); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); @@ -1959,8 +2146,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, chanctx = container_of(conf, struct ieee80211_chanctx, conf); - if (local->use_chanctx && - !ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { + if (!ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { sdata_info(sdata, "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; @@ -1972,7 +2158,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, goto drop_connection; } - res = ieee80211_link_reserve_chanctx(link, &csa_ie.chandef, + res = ieee80211_link_reserve_chanctx(link, &csa_ie.chanreq, chanctx->mode, false); if (res) { sdata_info(sdata, @@ -1982,18 +2168,21 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, } link->conf->csa_active = true; - link->csa_chandef = csa_ie.chandef; - link->csa_block_tx = csa_ie.mode; + link->csa_chanreq = csa_ie.chanreq; link->u.mgd.csa_ignored_same_chan = false; link->u.mgd.beacon_crc_valid = false; + link->u.mgd.csa_blocked_tx = csa_ie.mode; - if (link->csa_block_tx) + if (csa_ie.mode && + !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) { ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); + sdata->csa_blocked_queues = true; + } - cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef, + cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper, link->link_id, csa_ie.count, - csa_ie.mode, 0); + csa_ie.mode); if (local->ops->channel_switch) { /* use driver's channel switch callback */ @@ -2017,7 +2206,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, * reset when the disconnection worker runs. */ link->conf->csa_active = true; - link->csa_block_tx = csa_ie.mode; + link->u.mgd.csa_blocked_tx = csa_ie.mode; + sdata->csa_blocked_queues = + csa_ie.mode && !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); @@ -2414,7 +2605,7 @@ void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work) struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, dfs_cac_timer_work.work); - struct cfg80211_chan_def chandef = link->conf->chandef; + struct cfg80211_chan_def chandef = link->conf->chanreq.oper; struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -2769,7 +2960,7 @@ static u64 ieee80211_link_set_associated(struct ieee80211_link_data *link, ieee80211_check_rate_mask(link); - link->u.mgd.bss = cbss; + link->conf->bss = cbss; memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN); if (sdata->vif.p2p || @@ -2917,7 +3108,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ifmgd->associated = false; /* other links will be destroyed */ - sdata->deflink.u.mgd.bss = NULL; + sdata->deflink.conf->bss = NULL; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); @@ -2992,7 +3183,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->vif.cfg.ssid_len = 0; /* remove AP and TDLS peers */ - sta_info_flush(sdata); + sta_info_flush(sdata, -1); /* finally reset all BSS / config parameters */ if (!ieee80211_vif_is_mld(&sdata->vif)) @@ -3058,7 +3249,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->deflink.u.mgd.disable_wmm_tracking = false; ifmgd->flags = 0; - sdata->deflink.u.mgd.conn_flags = 0; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; @@ -3070,27 +3260,47 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, } sdata->vif.bss_conf.csa_active = false; + sdata->deflink.u.mgd.csa_blocked_tx = false; sdata->deflink.u.mgd.csa_waiting_bcn = false; sdata->deflink.u.mgd.csa_ignored_same_chan = false; - if (sdata->deflink.csa_block_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->deflink.csa_block_tx = false; + sdata->csa_blocked_queues = false; } /* existing TX TSPEC sessions no longer exist */ memset(ifmgd->tx_tspec, 0, sizeof(ifmgd->tx_tspec)); wiphy_delayed_work_cancel(local->hw.wiphy, &ifmgd->tx_tspec_wk); + sdata->vif.bss_conf.power_type = IEEE80211_REG_UNSET_AP; sdata->vif.bss_conf.pwr_reduction = 0; sdata->vif.bss_conf.tx_pwr_env_num = 0; memset(sdata->vif.bss_conf.tx_pwr_env, 0, sizeof(sdata->vif.bss_conf.tx_pwr_env)); + sdata->vif.cfg.eml_cap = 0; + sdata->vif.cfg.eml_med_sync_delay = 0; + sdata->vif.cfg.mld_capa_op = 0; + memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); + + memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &ifmgd->neg_ttlm_timeout_work); + + sdata->u.mgd.removed_links = 0; + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &sdata->u.mgd.ml_reconf_work); + + wiphy_work_cancel(sdata->local->hw.wiphy, + &ifmgd->teardown_ttlm_work); + ieee80211_vif_set_links(sdata, 0, 0); + + ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) @@ -3238,7 +3448,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len, - sdata->deflink.u.mgd.bss->channel); + sdata->deflink.conf->bss->channel); } ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms); @@ -3321,7 +3531,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, return NULL; if (ifmgd->associated) - cbss = sdata->deflink.u.mgd.bss; + cbss = sdata->deflink.conf->bss; else if (ifmgd->auth_data) cbss = ifmgd->auth_data->bss; else if (ifmgd->assoc_data && ifmgd->assoc_data->link[0].bss) @@ -3371,16 +3581,32 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; - bool tx; + bool tx = false; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; - /* in MLO assume we have a link where we can TX the frame */ - tx = ieee80211_vif_is_mld(&sdata->vif) || - !sdata->deflink.csa_block_tx; + /* only transmit if we have a link that makes that worthwhile */ + for (unsigned int link_id = 0; + link_id < ARRAY_SIZE(sdata->link); + link_id++) { + struct ieee80211_link_data *link; + + if (!ieee80211_vif_link_active(&sdata->vif, link_id)) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (WARN_ON_ONCE(!link)) + continue; + + if (link->u.mgd.csa_blocked_tx) + continue; + + tx = true; + break; + } if (!ifmgd->driver_disconnect) { unsigned int link_id; @@ -3400,8 +3626,8 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; - cfg80211_unlink_bss(local->hw.wiphy, link->u.mgd.bss); - link->u.mgd.bss = NULL; + cfg80211_unlink_bss(local->hw.wiphy, link->conf->bss); + link->conf->bss = NULL; } } @@ -3413,10 +3639,11 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa_waiting_bcn = false; - if (sdata->deflink.csa_block_tx) { + sdata->deflink.u.mgd.csa_blocked_tx = false; + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->deflink.csa_block_tx = false; + sdata->csa_blocked_queues = false; } ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx, @@ -3518,7 +3745,6 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, sta_info_destroy_addr(sdata, auth_data->ap_addr); /* other links are destroyed */ - sdata->deflink.u.mgd.conn_flags = 0; eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); @@ -3556,7 +3782,6 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, del_timer_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, assoc_data->ap_addr); - sdata->deflink.u.mgd.conn_flags = 0; eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); @@ -4006,11 +4231,13 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; struct ieee80211_elems_parse_params parse_params = { + .mode = link->u.mgd.conn.mode, .start = elem_start, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; + bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; bool is_s1g = cbss->channel->band == NL80211_BAND_S1GHZ; const struct cfg80211_bss_ies *bss_ies = NULL; @@ -4034,15 +4261,17 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, */ assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS; if (elems->ml_basic) { - if (!(elems->ml_basic->control & - cpu_to_le16(IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT))) { + int bss_param_ch_cnt = + ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); + + if (bss_param_ch_cnt < 0) { ret = false; goto out; } - link->u.mgd.bss_param_ch_cnt = - ieee80211_mle_get_bss_param_ch_cnt(elems->ml_basic); + link->u.mgd.bss_param_ch_cnt = bss_param_ch_cnt; } - } else if (!elems->prof || + } else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC || + !elems->prof || !(elems->prof->control & prof_bss_param_ch_present)) { ret = false; goto out; @@ -4086,9 +4315,9 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, */ if (!is_6ghz && ((assoc_data->wmm && !elems->wmm_param) || - (!(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT) && + (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || - (!(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT) && + (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems *bss_elems; @@ -4125,25 +4354,25 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, * have to include the IEs in the (re)association response. */ if (!elems->ht_cap_elem && bss_elems->ht_cap_elem && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT)) { + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_cap_elem = bss_elems->ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } if (!elems->ht_operation && bss_elems->ht_operation && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT)) { + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_operation = bss_elems->ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) { + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_cap_elem = bss_elems->vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } if (!elems->vht_operation && bss_elems->vht_operation && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) { + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_operation = bss_elems->vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); @@ -4155,8 +4384,10 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, /* * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. + * Note that the ieee80211_config_bw() below would also check + * for this (and more), but this has better error reporting. */ - if (!is_6ghz && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT) && + if (!is_6ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); @@ -4164,7 +4395,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, goto out; } - if (!is_6ghz && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT) && + if (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); @@ -4172,36 +4403,28 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, goto out; } - if (is_6ghz && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && - !elems->he_6ghz_capa) { - sdata_info(sdata, - "HE 6 GHz AP is missing HE 6 GHz band capability\n"); + /* check/update if AP changed anything in assoc response vs. scan */ + if (ieee80211_config_bw(link, elems, + link_id == assoc_data->assoc_link_id, + changed)) { ret = false; goto out; } - if (WARN_ON(!link->conf->chandef.chan)) { - ret = false; - goto out; - } - sband = local->hw.wiphy->bands[link->conf->chandef.chan->band]; - - if (!(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && - (!elems->he_cap || !elems->he_operation)) { - sdata_info(sdata, - "HE AP is missing HE capability/operation\n"); + if (WARN_ON(!link->conf->chanreq.oper.chan)) { ret = false; goto out; } + sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band]; /* Set up internal HT/VHT capabilities */ - if (elems->ht_cap_elem && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT)) + if (elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, elems->ht_cap_elem, link_sta); if (elems->vht_cap_elem && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT)) { + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { const struct ieee80211_vht_cap *bss_vht_cap = NULL; const struct cfg80211_bss_ies *ies; @@ -4228,14 +4451,43 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, rcu_read_unlock(); } - if (elems->he_operation && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && + if (elems->he_operation && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && elems->he_cap) { + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->he_6ghz_capa, link_sta); + he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); + + if (is_6ghz && he_6ghz_oper) { + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + bss_conf->power_type = IEEE80211_REG_LPI_AP; + break; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + bss_conf->power_type = IEEE80211_REG_SP_AP; + break; + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + bss_conf->power_type = IEEE80211_REG_VLP_AP; + break; + default: + bss_conf->power_type = IEEE80211_REG_UNSET_AP; + break; + } + } else if (is_6ghz) { + link_info(link, + "HE 6 GHz operation missing (on %d MHz), expect issues\n", + bss_conf->chanreq.oper.chan->center_freq); + } + bss_conf->he_support = link_sta->pub->he_cap.has_he; if (elems->rsnx && elems->rsnx_len && (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && @@ -4249,7 +4501,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, link_sta, elems); if (elems->eht_operation && elems->eht_cap && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT)) { + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_EHT) { ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, elems->he_cap, elems->he_cap_len, @@ -4258,7 +4510,6 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; - *changed |= BSS_CHANGED_EHT_PUNCTURING; } else { bss_conf->eht_support = false; } @@ -4456,7 +4707,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, bool support_160; u8 chains = 1; - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT) + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HT) return chains; ht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_CAPABILITY); @@ -4469,7 +4720,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, */ } - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_VHT) + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_VHT) return chains; vht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); @@ -4488,7 +4739,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, chains = max(chains, nss); } - if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HE) return chains; ies = rcu_dereference(cbss->ies); @@ -4539,533 +4790,331 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, return chains; } -static bool -ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, - const struct cfg80211_bss_ies *ies, - const struct ieee80211_he_operation *he_op) +static void +ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct cfg80211_assoc_request *req, + bool wmm_used, int link_id, + struct ieee80211_conn_settings *conn) { - const struct element *he_cap_elem; - const struct ieee80211_he_cap_elem *he_cap; - struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; - u16 mcs_80_map_tx, mcs_80_map_rx; - u16 ap_min_req_set; - int mcs_nss_size; - int nss; - - he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, - ies->data, ies->len); - - if (!he_cap_elem) - return false; + struct ieee80211_sta_ht_cap sta_ht_cap = sband->ht_cap; + bool is_5ghz = sband->band == NL80211_BAND_5GHZ; + bool is_6ghz = sband->band == NL80211_BAND_6GHZ; + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + struct ieee80211_sta_vht_cap vht_cap; - /* invalid HE IE */ - if (he_cap_elem->datalen < 1 + sizeof(*he_cap)) { - sdata_info(sdata, - "Invalid HE elem, Disable HE\n"); - return false; + if (sband->band == NL80211_BAND_S1GHZ) { + conn->mode = IEEE80211_CONN_MODE_S1G; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + mlme_dbg(sdata, "operating as S1G STA\n"); + return; } - /* skip one byte ext_tag_id */ - he_cap = (void *)(he_cap_elem->data + 1); - mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap); + conn->mode = IEEE80211_CONN_MODE_LEGACY; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; - /* invalid HE IE */ - if (he_cap_elem->datalen < 1 + sizeof(*he_cap) + mcs_nss_size) { - sdata_info(sdata, - "Invalid HE elem with nss size, Disable HE\n"); - return false; + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + + if (req && req->flags & ASSOC_REQ_DISABLE_HT) { + mlme_link_id_dbg(sdata, link_id, + "HT disabled by flag, limiting to legacy\n"); + goto out; } - /* mcs_nss is right after he_cap info */ - he_mcs_nss_supp = (void *)(he_cap + 1); + if (!wmm_used) { + mlme_link_id_dbg(sdata, link_id, + "WMM/QoS not supported, limiting to legacy\n"); + goto out; + } - mcs_80_map_tx = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); - mcs_80_map_rx = le16_to_cpu(he_mcs_nss_supp->rx_mcs_80); + if (req) { + unsigned int i; - /* P802.11-REVme/D0.3 - * 27.1.1 Introduction to the HE PHY - * ... - * An HE STA shall support the following features: - * ... - * Single spatial stream HE-MCSs 0 to 7 (transmit and receive) in all - * supported channel widths for HE SU PPDUs - */ - if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED || - (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) { - sdata_info(sdata, - "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", - mcs_80_map_tx, mcs_80_map_rx); - return false; + for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) { + if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || + req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || + req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { + netdev_info(sdata->dev, + "WEP/TKIP use, limiting to legacy\n"); + goto out; + } + } } - if (!he_op) - return true; + if (!sta_ht_cap.ht_supported && !is_6ghz) { + mlme_link_id_dbg(sdata, link_id, + "HT not supported (and not on 6 GHz), limiting to legacy\n"); + goto out; + } - ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + /* HT is fine */ + conn->mode = IEEE80211_CONN_MODE_HT; + conn->bw_limit = sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? + IEEE80211_CONN_BW_LIMIT_40 : + IEEE80211_CONN_BW_LIMIT_20; - /* - * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all - * zeroes, which is nonsense, and completely inconsistent with itself - * (it doesn't have 8 streams). Accept the settings in this case anyway. - */ - if (!ap_min_req_set) - return true; - - /* make sure the AP is consistent with itself - * - * P802.11-REVme/D0.3 - * 26.17.1 Basic HE BSS operation - * - * A STA that is operating in an HE BSS shall be able to receive and - * transmit at each of the <HE-MCS, NSS> tuple values indicated by the - * Basic HE-MCS And NSS Set field of the HE Operation parameter of the - * MLME-START.request primitive and shall be able to receive at each of - * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and - * NSS Set field in the HE Capabilities parameter of the MLMESTART.request - * primitive - */ - for (nss = 8; nss > 0; nss--) { - u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; - u8 ap_rx_val; - u8 ap_tx_val; + memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); + ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); - if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) - continue; + if (req && req->flags & ASSOC_REQ_DISABLE_VHT) { + mlme_link_id_dbg(sdata, link_id, + "VHT disabled by flag, limiting to HT\n"); + goto out; + } - ap_rx_val = (mcs_80_map_rx >> (2 * (nss - 1))) & 3; - ap_tx_val = (mcs_80_map_tx >> (2 * (nss - 1))) & 3; + if (vht_cap.vht_supported && is_5ghz) { + bool have_80mhz = false; + unsigned int i; - if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || - ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || - ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) { - sdata_info(sdata, - "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", - nss, ap_rx_val, ap_rx_val, ap_op_val); - return false; + if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) { + mlme_link_id_dbg(sdata, link_id, + "no 40 MHz support on 5 GHz, limiting to HT\n"); + goto out; } - } - return true; -} + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; -static bool -ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, - struct ieee80211_supported_band *sband, - const struct ieee80211_he_operation *he_op) -{ - const struct ieee80211_sta_he_cap *sta_he_cap = - ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); - u16 ap_min_req_set; - int i; + have_80mhz = true; + break; + } - if (!sta_he_cap || !he_op) - return false; + if (!have_80mhz) { + mlme_link_id_dbg(sdata, link_id, + "no 80 MHz channel support on 5 GHz, limiting to HT\n"); + goto out; + } + } else if (is_5ghz) { /* !vht_supported but on 5 GHz */ + mlme_link_id_dbg(sdata, link_id, + "no VHT support on 5 GHz, limiting to HT\n"); + goto out; + } - ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + /* VHT - if we have - is fine, including 80 MHz, check 160 below again */ + if (sband->band != NL80211_BAND_2GHZ) { + conn->mode = IEEE80211_CONN_MODE_VHT; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; + } - /* - * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all - * zeroes, which is nonsense, and completely inconsistent with itself - * (it doesn't have 8 streams). Accept the settings in this case anyway. - */ - if (!ap_min_req_set) - return true; + if (is_5ghz && + !(vht_cap.cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ | + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ))) { + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; + mlme_link_id_dbg(sdata, link_id, + "no VHT 160 MHz capability on 5 GHz, limiting to 80 MHz"); + } - /* Need to go over for 80MHz, 160MHz and for 80+80 */ - for (i = 0; i < 3; i++) { - const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = - &sta_he_cap->he_mcs_nss_supp; - u16 sta_mcs_map_rx = - le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); - u16 sta_mcs_map_tx = - le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); - u8 nss; - bool verified = true; + if (req && req->flags & ASSOC_REQ_DISABLE_HE) { + mlme_link_id_dbg(sdata, link_id, + "HE disabled by flag, limiting to HT/VHT\n"); + goto out; + } - /* - * For each band there is a maximum of 8 spatial streams - * possible. Each of the sta_mcs_map_* is a 16-bit struct built - * of 2 bits per NSS (1-8), with the values defined in enum - * ieee80211_he_mcs_support. Need to make sure STA TX and RX - * capabilities aren't less than the AP's minimum requirements - * for this HE BSS per SS. - * It is enough to find one such band that meets the reqs. - */ - for (nss = 8; nss > 0; nss--) { - u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; - u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; - u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); + if (!he_cap) { + WARN_ON(is_6ghz); + mlme_link_id_dbg(sdata, link_id, + "no HE support, limiting to HT/VHT\n"); + goto out; + } - if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) - continue; + /* so we have HE */ + conn->mode = IEEE80211_CONN_MODE_HE; - /* - * Make sure the HE AP doesn't require MCSs that aren't - * supported by the client as required by spec - * - * P802.11-REVme/D0.3 - * 26.17.1 Basic HE BSS operation - * - * An HE STA shall not attempt to join * (MLME-JOIN.request primitive) - * a BSS, unless it supports (i.e., is able to both transmit and - * receive using) all of the <HE-MCS, NSS> tuples in the basic - * HE-MCS and NSS set. - */ - if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || - sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || - (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { - verified = false; - break; - } + /* check bandwidth */ + switch (sband->band) { + default: + case NL80211_BAND_2GHZ: + if (he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) + break; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + mlme_link_id_dbg(sdata, link_id, + "no 40 MHz HE cap in 2.4 GHz, limiting to 20 MHz\n"); + break; + case NL80211_BAND_5GHZ: + if (!(he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)) { + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + mlme_link_id_dbg(sdata, link_id, + "no 40/80 MHz HE cap in 5 GHz, limiting to 20 MHz\n"); + break; + } + if (!(he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)) { + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_80); + mlme_link_id_dbg(sdata, link_id, + "no 160 MHz HE cap in 5 GHz, limiting to 80 MHz\n"); } + break; + case NL80211_BAND_6GHZ: + if (he_cap->he_cap_elem.phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + break; + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_80); + mlme_link_id_dbg(sdata, link_id, + "no 160 MHz HE cap in 6 GHz, limiting to 80 MHz\n"); + break; + } - if (verified) - return true; + if (req && req->flags & ASSOC_REQ_DISABLE_EHT) { + mlme_link_id_dbg(sdata, link_id, + "EHT disabled by flag, limiting to HE\n"); + goto out; } - /* If here, STA doesn't meet AP's HE min requirements */ - return false; -} + eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); + if (!eht_cap) { + mlme_link_id_dbg(sdata, link_id, + "no EHT support, limiting to HE\n"); + goto out; + } -static u8 -ieee80211_get_eht_cap_mcs_nss(const struct ieee80211_sta_he_cap *sta_he_cap, - const struct ieee80211_sta_eht_cap *sta_eht_cap, - unsigned int idx, int bw) -{ - u8 he_phy_cap0 = sta_he_cap->he_cap_elem.phy_cap_info[0]; - u8 eht_phy_cap0 = sta_eht_cap->eht_cap_elem.phy_cap_info[0]; + /* we have EHT */ - /* handle us being a 20 MHz-only EHT STA - with four values - * for MCS 0-7, 8-9, 10-11, 12-13. - */ - if (!(he_phy_cap0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) - return sta_eht_cap->eht_mcs_nss_supp.only_20mhz.rx_tx_max_nss[idx]; + conn->mode = IEEE80211_CONN_MODE_EHT; - /* the others have MCS 0-9 together, rather than separately from 0-7 */ - if (idx > 0) - idx--; + /* check bandwidth */ + if (is_6ghz && + eht_cap->eht_cap_elem.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_320; + else if (is_6ghz) + mlme_link_id_dbg(sdata, link_id, + "no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n"); - switch (bw) { - case 0: - return sta_eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_max_nss[idx]; - case 1: - if (!(he_phy_cap0 & - (IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G))) - return 0xff; /* pass check */ - return sta_eht_cap->eht_mcs_nss_supp.bw._160.rx_tx_max_nss[idx]; - case 2: - if (!(eht_phy_cap0 & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)) - return 0xff; /* pass check */ - return sta_eht_cap->eht_mcs_nss_supp.bw._320.rx_tx_max_nss[idx]; - } - - WARN_ON(1); - return 0; +out: + mlme_link_id_dbg(sdata, link_id, + "determined local STA to be %s, BW limited to %d MHz\n", + ieee80211_conn_mode_str(conn->mode), + 20 * (1 << conn->bw_limit)); } -static bool -ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata, - struct ieee80211_supported_band *sband, - const struct ieee80211_eht_operation *eht_op) +static void +ieee80211_determine_our_sta_mode_auth(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct cfg80211_auth_request *req, + bool wmm_used, + struct ieee80211_conn_settings *conn) { - const struct ieee80211_sta_he_cap *sta_he_cap = - ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); - const struct ieee80211_sta_eht_cap *sta_eht_cap = - ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); - const struct ieee80211_eht_mcs_nss_supp_20mhz_only *req; - unsigned int i; - - if (!sta_he_cap || !sta_eht_cap || !eht_op) - return false; - - req = &eht_op->basic_mcs_nss; - - for (i = 0; i < ARRAY_SIZE(req->rx_tx_max_nss); i++) { - u8 req_rx_nss, req_tx_nss; - unsigned int bw; - - req_rx_nss = u8_get_bits(req->rx_tx_max_nss[i], - IEEE80211_EHT_MCS_NSS_RX); - req_tx_nss = u8_get_bits(req->rx_tx_max_nss[i], - IEEE80211_EHT_MCS_NSS_TX); + ieee80211_determine_our_sta_mode(sdata, sband, NULL, wmm_used, + req->link_id > 0 ? req->link_id : 0, + conn); +} - for (bw = 0; bw < 3; bw++) { - u8 have, have_rx_nss, have_tx_nss; +static void +ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct cfg80211_assoc_request *req, + bool wmm_used, int link_id, + struct ieee80211_conn_settings *conn) +{ + struct ieee80211_conn_settings tmp; - have = ieee80211_get_eht_cap_mcs_nss(sta_he_cap, - sta_eht_cap, - i, bw); - have_rx_nss = u8_get_bits(have, - IEEE80211_EHT_MCS_NSS_RX); - have_tx_nss = u8_get_bits(have, - IEEE80211_EHT_MCS_NSS_TX); + WARN_ON(!req); - if (req_rx_nss > have_rx_nss || - req_tx_nss > have_tx_nss) - return false; - } - } + ieee80211_determine_our_sta_mode(sdata, sband, req, wmm_used, link_id, + &tmp); - return true; + conn->mode = min_t(enum ieee80211_conn_mode, + conn->mode, tmp.mode); + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, tmp.bw_limit); } static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, - struct cfg80211_bss *cbss, - bool mlo, - ieee80211_conn_flags_t *conn_flags) + int link_id, + struct cfg80211_bss *cbss, bool mlo, + struct ieee80211_conn_settings *conn) { struct ieee80211_local *local = sdata->local; - const struct ieee80211_ht_cap *ht_cap = NULL; - const struct ieee80211_ht_operation *ht_oper = NULL; - const struct ieee80211_vht_operation *vht_oper = NULL; - const struct ieee80211_he_operation *he_oper = NULL; - const struct ieee80211_eht_operation *eht_oper = NULL; - const struct ieee80211_s1g_oper_ie *s1g_oper = NULL; - struct ieee80211_supported_band *sband; - struct cfg80211_chan_def chandef; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; - bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; - bool supports_mlo = false; - struct ieee80211_bss *bss = (void *)cbss->priv; - struct ieee80211_elems_parse_params parse_params = { - .link_id = -1, - .from_ap = true, - }; + struct ieee80211_chan_req chanreq = {}; struct ieee802_11_elems *elems; - const struct cfg80211_bss_ies *ies; int ret; u32 i; - bool have_80mhz; lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); + elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id, + &chanreq); - ies = rcu_dereference(cbss->ies); - parse_params.start = ies->data; - parse_params.len = ies->len; - elems = ieee802_11_parse_elems_full(&parse_params); - if (!elems) { + if (IS_ERR(elems)) { rcu_read_unlock(); - return -ENOMEM; - } - - sband = local->hw.wiphy->bands[cbss->channel->band]; - - *conn_flags &= ~(IEEE80211_CONN_DISABLE_40MHZ | - IEEE80211_CONN_DISABLE_80P80MHZ | - IEEE80211_CONN_DISABLE_160MHZ); - - /* disable HT/VHT/HE if we don't support them */ - if (!sband->ht_cap.ht_supported && !is_6ghz) { - mlme_dbg(sdata, "HT not supported, disabling HT/VHT/HE/EHT\n"); - *conn_flags |= IEEE80211_CONN_DISABLE_HT; - *conn_flags |= IEEE80211_CONN_DISABLE_VHT; - *conn_flags |= IEEE80211_CONN_DISABLE_HE; - *conn_flags |= IEEE80211_CONN_DISABLE_EHT; - } - - if (!sband->vht_cap.vht_supported && is_5ghz) { - mlme_dbg(sdata, "VHT not supported, disabling VHT/HE/EHT\n"); - *conn_flags |= IEEE80211_CONN_DISABLE_VHT; - *conn_flags |= IEEE80211_CONN_DISABLE_HE; - *conn_flags |= IEEE80211_CONN_DISABLE_EHT; + return PTR_ERR(elems); } - if (!ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif)) { - mlme_dbg(sdata, "HE not supported, disabling HE and EHT\n"); - *conn_flags |= IEEE80211_CONN_DISABLE_HE; - *conn_flags |= IEEE80211_CONN_DISABLE_EHT; - } - - if (!ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif)) { - mlme_dbg(sdata, "EHT not supported, disabling EHT\n"); - *conn_flags |= IEEE80211_CONN_DISABLE_EHT; - } - - if (!(*conn_flags & IEEE80211_CONN_DISABLE_HT) && !is_6ghz) { - ht_oper = elems->ht_operation; - ht_cap = elems->ht_cap_elem; - - if (!ht_cap) { - *conn_flags |= IEEE80211_CONN_DISABLE_HT; - ht_oper = NULL; - } - } - - if (!(*conn_flags & IEEE80211_CONN_DISABLE_VHT) && !is_6ghz) { - vht_oper = elems->vht_operation; - if (vht_oper && !ht_oper) { - vht_oper = NULL; - sdata_info(sdata, - "AP advertised VHT without HT, disabling HT/VHT/HE\n"); - *conn_flags |= IEEE80211_CONN_DISABLE_HT; - *conn_flags |= IEEE80211_CONN_DISABLE_VHT; - *conn_flags |= IEEE80211_CONN_DISABLE_HE; - *conn_flags |= IEEE80211_CONN_DISABLE_EHT; - } - - if (!elems->vht_cap_elem) { - *conn_flags |= IEEE80211_CONN_DISABLE_VHT; - vht_oper = NULL; - } + if (mlo && !elems->ml_basic) { + sdata_info(sdata, "Rejecting MLO as it is not supported by AP\n"); + rcu_read_unlock(); + kfree(elems); + return -EINVAL; } - if (!(*conn_flags & IEEE80211_CONN_DISABLE_HE)) { - he_oper = elems->he_operation; + if (link && is_6ghz && conn->mode >= IEEE80211_CONN_MODE_HE) { + struct ieee80211_bss_conf *bss_conf; + u8 j = 0; - if (link && is_6ghz) { - struct ieee80211_bss_conf *bss_conf; - u8 j = 0; + bss_conf = link->conf; - bss_conf = link->conf; + if (elems->pwr_constr_elem) + bss_conf->pwr_reduction = *elems->pwr_constr_elem; - if (elems->pwr_constr_elem) - bss_conf->pwr_reduction = *elems->pwr_constr_elem; + BUILD_BUG_ON(ARRAY_SIZE(bss_conf->tx_pwr_env) != + ARRAY_SIZE(elems->tx_pwr_env)); - BUILD_BUG_ON(ARRAY_SIZE(bss_conf->tx_pwr_env) != - ARRAY_SIZE(elems->tx_pwr_env)); - - for (i = 0; i < elems->tx_pwr_env_num; i++) { - if (elems->tx_pwr_env_len[i] > - sizeof(bss_conf->tx_pwr_env[j])) - continue; + for (i = 0; i < elems->tx_pwr_env_num; i++) { + if (elems->tx_pwr_env_len[i] > sizeof(bss_conf->tx_pwr_env[j])) + continue; - bss_conf->tx_pwr_env_num++; - memcpy(&bss_conf->tx_pwr_env[j], elems->tx_pwr_env[i], - elems->tx_pwr_env_len[i]); - j++; - } + bss_conf->tx_pwr_env_num++; + memcpy(&bss_conf->tx_pwr_env[j], elems->tx_pwr_env[i], + elems->tx_pwr_env_len[i]); + j++; } - - if (!ieee80211_verify_peer_he_mcs_support(sdata, ies, he_oper) || - !ieee80211_verify_sta_he_mcs_support(sdata, sband, he_oper)) - *conn_flags |= IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; } - - /* - * EHT requires HE to be supported as well. Specifically for 6 GHz - * channels, the operation channel information can only be deduced from - * both the 6 GHz operation information (from the HE operation IE) and - * EHT operation. - */ - if (!(*conn_flags & - (IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT)) && - he_oper) { - const struct cfg80211_bss_ies *cbss_ies; - const struct element *eht_ml_elem; - const u8 *eht_oper_ie; - - cbss_ies = rcu_dereference(cbss->ies); - eht_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_EHT_OPERATION, - cbss_ies->data, cbss_ies->len); - if (eht_oper_ie && eht_oper_ie[1] >= - 1 + sizeof(struct ieee80211_eht_operation)) - eht_oper = (void *)(eht_oper_ie + 3); - else - eht_oper = NULL; - - if (!ieee80211_verify_sta_eht_mcs_support(sdata, sband, eht_oper)) - *conn_flags |= IEEE80211_CONN_DISABLE_EHT; - - eht_ml_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, - cbss_ies->data, cbss_ies->len); - - /* data + 1 / datalen - 1 since it's an extended element */ - if (!(*conn_flags & IEEE80211_CONN_DISABLE_EHT) && - eht_ml_elem && - ieee80211_mle_type_ok(eht_ml_elem->data + 1, - IEEE80211_ML_CONTROL_TYPE_BASIC, - eht_ml_elem->datalen - 1)) { - supports_mlo = true; - - sdata->vif.cfg.eml_cap = - ieee80211_mle_get_eml_cap(eht_ml_elem->data + 1); - sdata->vif.cfg.eml_med_sync_delay = - ieee80211_mle_get_eml_med_sync_delay(eht_ml_elem->data + 1); - } - } - - /* Allow VHT if at least one channel on the sband supports 80 MHz */ - have_80mhz = false; - for (i = 0; i < sband->n_channels; i++) { - if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | - IEEE80211_CHAN_NO_80MHZ)) - continue; - - have_80mhz = true; - break; - } - - if (!have_80mhz) { - sdata_info(sdata, "80 MHz not supported, disabling VHT\n"); - *conn_flags |= IEEE80211_CONN_DISABLE_VHT; - } - - if (sband->band == NL80211_BAND_S1GHZ) { - s1g_oper = elems->s1g_oper; - if (!s1g_oper) - sdata_info(sdata, - "AP missing S1G operation element?\n"); - } - - *conn_flags |= - ieee80211_determine_chantype(sdata, link, *conn_flags, - sband, - cbss->channel, - bss->vht_cap_info, - ht_oper, vht_oper, - he_oper, eht_oper, - s1g_oper, - &chandef, false); - - if (link) - link->needed_rx_chains = - min(ieee80211_max_rx_chains(link, cbss), - local->rx_chains); - rcu_read_unlock(); /* the element data was RCU protected so no longer valid anyway */ kfree(elems); elems = NULL; - if (*conn_flags & IEEE80211_CONN_DISABLE_HE && is_6ghz) { - sdata_info(sdata, "Rejecting non-HE 6/7 GHz connection"); - return -EINVAL; - } - - if (mlo && !supports_mlo) { - sdata_info(sdata, "Rejecting MLO as it is not supported by AP\n"); - return -EINVAL; - } - if (!link) return 0; + rcu_read_lock(); + link->needed_rx_chains = min(ieee80211_max_rx_chains(link, cbss), + local->rx_chains); + rcu_read_unlock(); + /* * If this fails (possibly due to channel context sharing * on incompatible channels, e.g. 80+80 and 160 sharing the * same control channel) try to use a smaller bandwidth. */ - ret = ieee80211_link_use_channel(link, &chandef, + ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); /* don't downgrade for 5 and 10 MHz channels, though. */ - if (chandef.width == NL80211_CHAN_WIDTH_5 || - chandef.width == NL80211_CHAN_WIDTH_10) - goto out; + if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || + chanreq.oper.width == NL80211_CHAN_WIDTH_10) + return ret; + + while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) { + ieee80211_chanreq_downgrade(&chanreq, conn); - while (ret && chandef.width != NL80211_CHAN_WIDTH_20_NOHT) { - *conn_flags |= - ieee80211_chandef_downgrade(&chandef); - ret = ieee80211_link_use_channel(link, &chandef, + ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); } - out: + return ret; } @@ -5126,6 +5175,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (WARN_ON(!sta)) goto out_err; + sta->sta.spp_amsdu = assoc_data->spp_amsdu; + if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!assoc_data->link[link_id].bss) @@ -5189,8 +5240,10 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; if (link_id != assoc_data->assoc_link_id) { - err = ieee80211_prep_channel(sdata, link, cbss, true, - &link->u.mgd.conn_flags); + link->u.mgd.conn = assoc_data->link[link_id].conn; + + err = ieee80211_prep_channel(sdata, link, link_id, cbss, + true, &link->u.mgd.conn); if (err) { link_info(link, "prep_channel failed\n"); goto out_err; @@ -5308,6 +5361,9 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (!assoc_data) return; + parse_params.mode = + assoc_data->link[assoc_data->assoc_link_id].conn.mode; + if (!ether_addr_equal(assoc_data->ap_addr, mgmt->bssid) || !ether_addr_equal(assoc_data->ap_addr, mgmt->sa)) return; @@ -5424,6 +5480,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, assoc_data->ap_addr); goto abandon_assoc; } + + sdata->vif.cfg.eml_cap = + ieee80211_mle_get_eml_cap((const void *)elems->ml_basic); + sdata->vif.cfg.eml_med_sync_delay = + ieee80211_mle_get_eml_med_sync_delay((const void *)elems->ml_basic); + sdata->vif.cfg.mld_capa_op = + ieee80211_mle_get_mld_capa_op((const void *)elems->ml_basic); } sdata->vif.cfg.aid = aid; @@ -5686,49 +5749,6 @@ static bool ieee80211_rx_our_beacon(const u8 *tx_bssid, return ether_addr_equal(tx_bssid, bss->transmitted_bss->bssid); } -static bool ieee80211_config_puncturing(struct ieee80211_link_data *link, - const struct ieee80211_eht_operation *eht_oper, - u64 *changed) -{ - struct ieee80211_local *local = link->sdata->local; - u16 bitmap = 0, extracted; - - if ((eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) && - (eht_oper->params & - IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) { - const struct ieee80211_eht_operation_info *info = - (void *)eht_oper->optional; - const u8 *disable_subchannel_bitmap = info->optional; - - bitmap = get_unaligned_le16(disable_subchannel_bitmap); - } - - extracted = ieee80211_extract_dis_subch_bmap(eht_oper, - &link->conf->chandef, - bitmap); - - /* accept if there are no changes */ - if (!(*changed & BSS_CHANGED_BANDWIDTH) && - extracted == link->conf->eht_puncturing) - return true; - - if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap, - &link->conf->chandef)) { - link_info(link, - "Got an invalid disable subchannel bitmap from AP %pM: bitmap = 0x%x, bw = 0x%x. disconnect\n", - link->u.mgd.bssid, - bitmap, - link->conf->chandef.width); - return false; - } - - if (bitmap && ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING)) - return false; - - ieee80211_handle_puncturing_bitmap(link, eht_oper, bitmap, changed); - return true; -} - static void ieee80211_ml_reconf_work(struct wiphy *wiphy, struct wiphy_work *work) { @@ -5792,9 +5812,7 @@ out: static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { - const struct ieee80211_multi_link_elem *ml; const struct element *sub; - ssize_t ml_len; unsigned long removed_links = 0; u16 link_removal_timeout[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u8 link_id; @@ -5803,24 +5821,11 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_reconf) return; - ml_len = cfg80211_defragment_element(elems->ml_reconf_elem, - elems->ie_start, - elems->total_len, - elems->scratch_pos, - elems->scratch + elems->scratch_len - - elems->scratch_pos, - WLAN_EID_FRAGMENT); - if (ml_len < 0) - return; - - elems->ml_reconf = (const void *)elems->scratch_pos; - elems->ml_reconf_len = ml_len; - ml = elems->ml_reconf; - /* Directly parse the sub elements as the common information doesn't * hold any useful information. */ - for_each_mle_subelement(sub, (u8 *)ml, ml_len) { + for_each_mle_subelement(sub, (const u8 *)elems->ml_reconf, + elems->ml_reconf_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; u8 *pos = prof->variable; u16 control; @@ -5891,6 +5896,64 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, TU_TO_JIFFIES(delay)); } +static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, + u16 active_links, u16 dormant_links, + u16 suspended_links) +{ + u64 changed = 0; + int ret; + + if (!active_links) { + ret = -EINVAL; + goto out; + } + + /* If there is an active negotiated TTLM, it should be discarded by + * the new negotiated/advertised TTLM. + */ + if (sdata->vif.neg_ttlm.valid) { + memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); + sdata->vif.suspended_links = 0; + changed = BSS_CHANGED_MLD_TTLM; + } + + if (sdata->vif.active_links != active_links) { + /* usable links are affected when active_links are changed, + * so notify the driver about the status change + */ + changed |= BSS_CHANGED_MLD_VALID_LINKS; + active_links &= sdata->vif.active_links; + if (!active_links) + active_links = + BIT(__ffs(sdata->vif.valid_links & + ~dormant_links)); + ret = ieee80211_set_active_links(&sdata->vif, active_links); + if (ret) { + sdata_info(sdata, "Failed to set TTLM active links\n"); + goto out; + } + } + + ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, + dormant_links); + if (ret) { + sdata_info(sdata, "Failed to set TTLM dormant links\n"); + goto out; + } + + sdata->vif.suspended_links = suspended_links; + if (sdata->vif.suspended_links) + changed |= BSS_CHANGED_MLD_TTLM; + + ieee80211_vif_cfg_change_notify(sdata, changed); + +out: + if (ret) + ieee80211_disconnect(&sdata->vif, false); + + return ret; +} + static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy, struct wiphy_work *work) { @@ -5898,30 +5961,19 @@ static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ttlm_work.work); - int ret; new_active_links = sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; new_dormant_links = ~sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; - if (!new_active_links) { - ieee80211_disconnect(&sdata->vif, false); - return; - } ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); - new_active_links = BIT(ffs(new_active_links) - 1); - ieee80211_set_active_links(&sdata->vif, new_active_links); - - ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, - new_dormant_links); + if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, + 0)) + return; sdata->u.mgd.ttlm_info.active = true; sdata->u.mgd.ttlm_info.switch_time = 0; - - if (!ret) - ieee80211_vif_cfg_change_notify(sdata, - BSS_CHANGED_MLD_VALID_LINKS); } static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data) @@ -6131,6 +6183,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, u8 *bssid, *variable = mgmt->u.beacon.variable; u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; struct ieee80211_elems_parse_params parse_params = { + .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, }; @@ -6184,7 +6237,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, link->u.mgd.dtim_period = elems->dtim_period; link->u.mgd.have_beacon = true; ifmgd->assoc_data->need_beacon = false; - if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && + !ieee80211_is_s1g_beacon(hdr->frame_control)) { link->conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); link->conf->sync_device_ts = @@ -6213,7 +6267,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, } if (!ifmgd->associated || - !ieee80211_rx_our_beacon(bssid, link->u.mgd.bss)) + !ieee80211_rx_our_beacon(bssid, link->conf->bss)) return; bssid = link->u.mgd.bssid; @@ -6240,7 +6294,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, */ if (!ieee80211_is_s1g_beacon(hdr->frame_control)) ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); - parse_params.bss = link->u.mgd.bss; + parse_params.bss = link->conf->bss; parse_params.filter = care_about_ies; parse_params.crc = ncrc; elems = ieee802_11_parse_elems_full(&parse_params); @@ -6302,9 +6356,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, } } - if (link->u.mgd.csa_waiting_bcn) - ieee80211_chswitch_post_beacon(link); - /* * Update beacon timing and dtim count on every beacon appearance. This * will allow the driver to use the most updated values. Do it before @@ -6378,21 +6429,14 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, goto free; } - if (WARN_ON(!link->conf->chandef.chan)) + if (WARN_ON(!link->conf->chanreq.oper.chan)) goto free; - sband = local->hw.wiphy->bands[link->conf->chandef.chan->band]; + sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band]; changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); - if (ieee80211_config_bw(link, elems->ht_cap_elem, - elems->vht_cap_elem, elems->ht_operation, - elems->vht_operation, elems->he_operation, - elems->eht_operation, - elems->s1g_oper, bssid, &changed)) { - sdata_info(sdata, - "failed to follow AP %pM bandwidth change, disconnect\n", - bssid); + if (ieee80211_config_bw(link, elems, true, &changed)) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); @@ -6414,21 +6458,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, elems->pwr_constr_elem, elems->cisco_dtpc_elem); - if (elems->eht_operation && - !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT)) { - if (!ieee80211_config_puncturing(link, elems->eht_operation, - &changed)) { - ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, - WLAN_REASON_DEAUTH_LEAVING, - true, deauth_buf); - ieee80211_report_disconnect(sdata, deauth_buf, - sizeof(deauth_buf), true, - WLAN_REASON_DEAUTH_LEAVING, - false); - goto free; - } - } - ieee80211_ml_reconfiguration(sdata, elems); ieee80211_process_adv_ttlm(sdata, elems, le64_to_cpu(mgmt->u.beacon.timestamp)); @@ -6438,6 +6467,430 @@ free: kfree(elems); } +static void ieee80211_apply_neg_ttlm(struct ieee80211_sub_if_data *sdata, + struct ieee80211_neg_ttlm neg_ttlm) +{ + u16 new_active_links, new_dormant_links, new_suspended_links, map = 0; + u8 i; + + for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) + map |= neg_ttlm.downlink[i] | neg_ttlm.uplink[i]; + + /* If there is an active TTLM, unset previously suspended links */ + if (sdata->vif.neg_ttlm.valid) + sdata->vif.dormant_links &= ~sdata->vif.suspended_links; + + /* exclude links that are already disabled by advertised TTLM */ + new_active_links = + map & sdata->vif.valid_links & ~sdata->vif.dormant_links; + new_suspended_links = + (~map & sdata->vif.valid_links) & ~sdata->vif.dormant_links; + new_dormant_links = sdata->vif.dormant_links | new_suspended_links; + if (ieee80211_ttlm_set_links(sdata, new_active_links, + new_dormant_links, new_suspended_links)) + return; + + sdata->vif.neg_ttlm = neg_ttlm; + sdata->vif.neg_ttlm.valid = true; +} + +static void ieee80211_neg_ttlm_timeout_work(struct wiphy *wiphy, + struct wiphy_work *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.neg_ttlm_timeout_work.work); + + sdata_info(sdata, + "No negotiated TTLM response from AP, disconnecting.\n"); + + __ieee80211_disconnect(sdata); +} + +static void +ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb, + struct ieee80211_neg_ttlm *neg_ttlm) +{ + u8 i, direction[IEEE80211_TTLM_MAX_CNT]; + + if (memcmp(neg_ttlm->downlink, neg_ttlm->uplink, + sizeof(neg_ttlm->downlink))) { + direction[0] = IEEE80211_TTLM_DIRECTION_DOWN; + direction[1] = IEEE80211_TTLM_DIRECTION_UP; + } else { + direction[0] = IEEE80211_TTLM_DIRECTION_BOTH; + } + + for (i = 0; i < ARRAY_SIZE(direction); i++) { + u8 tid, len, map_ind = 0, *len_pos, *map_ind_pos, *pos; + __le16 map; + + len = sizeof(struct ieee80211_ttlm_elem) + 1 + 1; + + pos = skb_put(skb, len + 2); + *pos++ = WLAN_EID_EXTENSION; + len_pos = pos++; + *pos++ = WLAN_EID_EXT_TID_TO_LINK_MAPPING; + *pos++ = direction[i]; + map_ind_pos = pos++; + for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { + map = direction[i] == IEEE80211_TTLM_DIRECTION_UP ? + cpu_to_le16(neg_ttlm->uplink[tid]) : + cpu_to_le16(neg_ttlm->downlink[tid]); + if (!map) + continue; + + len += 2; + map_ind |= BIT(tid); + skb_put_data(skb, &map, sizeof(map)); + } + + *map_ind_pos = map_ind; + *len_pos = len; + + if (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH) + break; + } +} + +static void +ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_neg_ttlm *neg_ttlm, + u8 dialog_token) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req); + int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; + + skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); + if (!skb) + return; + + skb_reserve(skb, local->tx_headroom); + mgmt = skb_put_zero(skb, hdr_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.ttlm_req.action_code = + WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; + mgmt->u.action.u.ttlm_req.dialog_token = dialog_token; + ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); + ieee80211_tx_skb(sdata, skb); +} + +int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, + struct cfg80211_ttlm_params *params) +{ + struct ieee80211_neg_ttlm neg_ttlm = {}; + u8 i; + + if (!ieee80211_vif_is_mld(&sdata->vif) || + !(sdata->vif.cfg.mld_capa_op & + IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP)) + return -EINVAL; + + for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { + if ((params->dlink[i] & ~sdata->vif.valid_links) || + (params->ulink[i] & ~sdata->vif.valid_links)) + return -EINVAL; + + neg_ttlm.downlink[i] = params->dlink[i]; + neg_ttlm.uplink[i] = params->ulink[i]; + } + + if (drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm) != + NEG_TTLM_RES_ACCEPT) + return -EINVAL; + + ieee80211_apply_neg_ttlm(sdata, neg_ttlm); + sdata->u.mgd.dialog_token_alloc++; + ieee80211_send_neg_ttlm_req(sdata, &sdata->vif.neg_ttlm, + sdata->u.mgd.dialog_token_alloc); + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &sdata->u.mgd.neg_ttlm_timeout_work); + wiphy_delayed_work_queue(sdata->local->hw.wiphy, + &sdata->u.mgd.neg_ttlm_timeout_work, + IEEE80211_NEG_TTLM_REQ_TIMEOUT); + return 0; +} + +static void +ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, + enum ieee80211_neg_ttlm_res ttlm_res, + u8 dialog_token, + struct ieee80211_neg_ttlm *neg_ttlm) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); + int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; + + skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); + if (!skb) + return; + + skb_reserve(skb, local->tx_headroom); + mgmt = skb_put_zero(skb, hdr_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.ttlm_res.action_code = + WLAN_PROTECTED_EHT_ACTION_TTLM_RES; + mgmt->u.action.u.ttlm_res.dialog_token = dialog_token; + switch (ttlm_res) { + default: + WARN_ON(1); + fallthrough; + case NEG_TTLM_RES_REJECT: + mgmt->u.action.u.ttlm_res.status_code = + WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; + break; + case NEG_TTLM_RES_ACCEPT: + mgmt->u.action.u.ttlm_res.status_code = WLAN_STATUS_SUCCESS; + break; + case NEG_TTLM_RES_SUGGEST_PREFERRED: + mgmt->u.action.u.ttlm_res.status_code = + WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; + ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); + break; + } + + ieee80211_tx_skb(sdata, skb); +} + +static int +ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_ttlm_elem *ttlm, + struct ieee80211_neg_ttlm *neg_ttlm, + u8 *direction) +{ + u8 control, link_map_presence, map_size, tid; + u8 *pos; + + /* The element size was already validated in + * ieee80211_tid_to_link_map_size_ok() + */ + pos = (void *)ttlm->optional; + + control = ttlm->control; + + /* mapping switch time and expected duration fields are not expected + * in case of negotiated TTLM + */ + if (control & (IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT | + IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)) { + mlme_dbg(sdata, + "Invalid TTLM element in negotiated TTLM request\n"); + return -EINVAL; + } + + if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) { + for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { + neg_ttlm->downlink[tid] = sdata->vif.valid_links; + neg_ttlm->uplink[tid] = sdata->vif.valid_links; + } + *direction = IEEE80211_TTLM_DIRECTION_BOTH; + return 0; + } + + *direction = u8_get_bits(control, IEEE80211_TTLM_CONTROL_DIRECTION); + if (*direction != IEEE80211_TTLM_DIRECTION_DOWN && + *direction != IEEE80211_TTLM_DIRECTION_UP && + *direction != IEEE80211_TTLM_DIRECTION_BOTH) + return -EINVAL; + + link_map_presence = *pos; + pos++; + + if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) + map_size = 1; + else + map_size = 2; + + for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { + u16 map; + + if (link_map_presence & BIT(tid)) { + map = ieee80211_get_ttlm(map_size, pos); + if (!map) { + mlme_dbg(sdata, + "No active links for TID %d", tid); + return -EINVAL; + } + } else { + map = 0; + } + + switch (*direction) { + case IEEE80211_TTLM_DIRECTION_BOTH: + neg_ttlm->downlink[tid] = map; + neg_ttlm->uplink[tid] = map; + break; + case IEEE80211_TTLM_DIRECTION_DOWN: + neg_ttlm->downlink[tid] = map; + break; + case IEEE80211_TTLM_DIRECTION_UP: + neg_ttlm->uplink[tid] = map; + break; + default: + return -EINVAL; + } + pos += map_size; + } + return 0; +} + +void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + u8 dialog_token, direction[IEEE80211_TTLM_MAX_CNT] = {}, i; + size_t ies_len; + enum ieee80211_neg_ttlm_res ttlm_res = NEG_TTLM_RES_ACCEPT; + struct ieee802_11_elems *elems = NULL; + struct ieee80211_neg_ttlm neg_ttlm = {}; + + BUILD_BUG_ON(ARRAY_SIZE(direction) != ARRAY_SIZE(elems->ttlm)); + + if (!ieee80211_vif_is_mld(&sdata->vif)) + return; + + dialog_token = mgmt->u.action.u.ttlm_req.dialog_token; + ies_len = len - offsetof(struct ieee80211_mgmt, + u.action.u.ttlm_req.variable); + elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, + ies_len, true, NULL); + if (!elems) { + ttlm_res = NEG_TTLM_RES_REJECT; + goto out; + } + + for (i = 0; i < elems->ttlm_num; i++) { + if (ieee80211_parse_neg_ttlm(sdata, elems->ttlm[i], + &neg_ttlm, &direction[i]) || + (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH && + elems->ttlm_num != 1)) { + ttlm_res = NEG_TTLM_RES_REJECT; + goto out; + } + } + + if (!elems->ttlm_num || + (elems->ttlm_num == 2 && direction[0] == direction[1])) { + ttlm_res = NEG_TTLM_RES_REJECT; + goto out; + } + + for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { + if ((neg_ttlm.downlink[i] && + (neg_ttlm.downlink[i] & ~sdata->vif.valid_links)) || + (neg_ttlm.uplink[i] && + (neg_ttlm.uplink[i] & ~sdata->vif.valid_links))) { + ttlm_res = NEG_TTLM_RES_REJECT; + goto out; + } + } + + ttlm_res = drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm); + + if (ttlm_res != NEG_TTLM_RES_ACCEPT) + goto out; + + ieee80211_apply_neg_ttlm(sdata, neg_ttlm); +out: + kfree(elems); + ieee80211_send_neg_ttlm_res(sdata, ttlm_res, dialog_token, &neg_ttlm); +} + +void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + if (!ieee80211_vif_is_mld(&sdata->vif) || + mgmt->u.action.u.ttlm_req.dialog_token != + sdata->u.mgd.dialog_token_alloc) + return; + + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &sdata->u.mgd.neg_ttlm_timeout_work); + + /* MLD station sends a TID to link mapping request, mainly to handle + * BTM (BSS transition management) request, in which case it needs to + * restrict the active links set. + * In this case it's not expected that the MLD AP will reject the + * negotiated TTLM request. + * This can be better implemented in the future, to handle request + * rejections. + */ + if (mgmt->u.action.u.ttlm_res.status_code != WLAN_STATUS_SUCCESS) + __ieee80211_disconnect(sdata); +} + +static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, + struct wiphy_work *work) +{ + u16 new_dormant_links; + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.teardown_ttlm_work); + + if (!sdata->vif.neg_ttlm.valid) + return; + + memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); + new_dormant_links = + sdata->vif.dormant_links & ~sdata->vif.suspended_links; + sdata->vif.suspended_links = 0; + ieee80211_vif_set_links(sdata, sdata->vif.valid_links, + new_dormant_links); + ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_TTLM | + BSS_CHANGED_MLD_VALID_LINKS); +} + +void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + int frame_len = offsetofend(struct ieee80211_mgmt, + u.action.u.ttlm_tear_down); + struct ieee80211_tx_info *info; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); + if (!skb) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, frame_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.ttlm_tear_down.action_code = + WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; + + info = IEEE80211_SKB_CB(skb); + info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + info->status_data = IEEE80211_STATUS_TYPE_NEG_TTLM; + ieee80211_tx_skb(sdata, skb); +} +EXPORT_SYMBOL(ieee80211_send_teardown_neg_ttlm); + void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { @@ -7067,6 +7520,10 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_sta_handle_tspec_ac_params_wk); wiphy_delayed_work_init(&ifmgd->ttlm_work, ieee80211_tid_to_link_map_work); + wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, + ieee80211_neg_ttlm_timeout_work); + wiphy_work_init(&ifmgd->teardown_ttlm_work, + ieee80211_teardown_ttlm_work); ifmgd->flags = 0; ifmgd->powersave = sdata->wdev.ps; @@ -7076,6 +7533,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) spin_lock_init(&ifmgd->teardown_lock); ifmgd->teardown_skb = NULL; ifmgd->orig_teardown_skb = NULL; + ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; } static void ieee80211_recalc_smps_work(struct wiphy *wiphy, @@ -7095,7 +7553,6 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) unsigned int link_id = link->link_id; link->u.mgd.p2p_noa_index = -1; - link->u.mgd.conn_flags = 0; link->conf->bssid = link->u.mgd.bssid; link->smps_mode = IEEE80211_SMPS_OFF; @@ -7135,6 +7592,7 @@ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, s8 link_id, const u8 *ap_mld_addr, bool assoc, + struct ieee80211_conn_settings *conn, bool override) { struct ieee80211_local *local = sdata->local; @@ -7266,13 +7724,22 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, } if (new_sta || override) { - err = ieee80211_prep_channel(sdata, link, cbss, mlo, - &link->u.mgd.conn_flags); + /* + * Only set this if we're also going to calculate the AP + * settings etc., otherwise this was set before in a + * previous call. Note override is set to %true in assoc + * if the settings were changed. + */ + link->u.mgd.conn = *conn; + err = ieee80211_prep_channel(sdata, link, link->link_id, cbss, + mlo, &link->u.mgd.conn); if (err) { if (new_sta) sta_info_free(local, new_sta); goto out_err; } + /* pass out for use in assoc */ + *conn = link->u.mgd.conn; } if (new_sta) { @@ -7294,7 +7761,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "failed to insert STA entry for the AP (error %d)\n", err); - goto out_err; + goto out_release_chan; } } else WARN_ON_ONCE(!ether_addr_equal(link->u.mgd.bssid, cbss->bssid)); @@ -7305,8 +7772,9 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return 0; +out_release_chan: + ieee80211_link_release_channel(link); out_err: - ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); return err; } @@ -7387,10 +7855,13 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data; + struct ieee80211_conn_settings conn; struct ieee80211_link_data *link; + struct ieee80211_supported_band *sband; + struct ieee80211_bss *bss; u16 auth_alg; int err; - bool cont_auth; + bool cont_auth, wmm_used; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -7521,8 +7992,17 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, /* needed for transmitting the auth frame(s) properly */ memcpy(sdata->vif.cfg.ap_addr, auth_data->ap_addr, ETH_ALEN); + bss = (void *)req->bss->priv; + wmm_used = bss->wmm_used && (local->hw.queues >= IEEE80211_NUM_ACS); + + sband = local->hw.wiphy->bands[req->bss->channel->band]; + + ieee80211_determine_our_sta_mode_auth(sdata, sband, req, wmm_used, + &conn); + err = ieee80211_prep_connection(sdata, req->bss, req->link_id, - req->ap_mld_addr, cont_auth, false); + req->ap_mld_addr, cont_auth, + &conn, false); if (err) goto err_clear; @@ -7561,38 +8041,33 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, return err; } -static ieee80211_conn_flags_t +static void ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, struct cfg80211_assoc_request *req, - ieee80211_conn_flags_t conn_flags, + struct ieee80211_conn_settings *conn, unsigned int link_id) { struct ieee80211_local *local = sdata->local; const struct cfg80211_bss_ies *bss_ies; struct ieee80211_supported_band *sband; - const struct element *ht_elem, *vht_elem; struct ieee80211_link_data *link; struct cfg80211_bss *cbss; struct ieee80211_bss *bss; - bool is_5ghz, is_6ghz; cbss = assoc_data->link[link_id].bss; if (WARN_ON(!cbss)) - return 0; + return; bss = (void *)cbss->priv; sband = local->hw.wiphy->bands[cbss->channel->band]; if (WARN_ON(!sband)) - return 0; + return; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) - return 0; - - is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; - is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; + return; /* for MLO connections assume advertising all rates is OK */ if (!req->ap_mld_addr) { @@ -7609,40 +8084,18 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, assoc_data->ie_pos += req->links[link_id].elems_len; } - rcu_read_lock(); - ht_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_OPERATION); - if (ht_elem && ht_elem->datalen >= sizeof(struct ieee80211_ht_operation)) - assoc_data->link[link_id].ap_ht_param = - ((struct ieee80211_ht_operation *)(ht_elem->data))->ht_param; - else if (!is_6ghz) - conn_flags |= IEEE80211_CONN_DISABLE_HT; - vht_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); - if (vht_elem && vht_elem->datalen >= sizeof(struct ieee80211_vht_cap)) { - memcpy(&assoc_data->link[link_id].ap_vht_cap, vht_elem->data, - sizeof(struct ieee80211_vht_cap)); - } else if (is_5ghz) { - link_info(link, - "VHT capa missing/short, disabling VHT/HE/EHT\n"); - conn_flags |= IEEE80211_CONN_DISABLE_VHT | - IEEE80211_CONN_DISABLE_HE | - IEEE80211_CONN_DISABLE_EHT; - } - rcu_read_unlock(); - link->u.mgd.beacon_crc_valid = false; link->u.mgd.dtim_period = 0; link->u.mgd.have_beacon = false; - /* override HT/VHT configuration only if the AP and we support it */ - if (!(conn_flags & IEEE80211_CONN_DISABLE_HT)) { + /* override HT configuration only if the AP and we support it */ + if (conn->mode >= IEEE80211_CONN_MODE_HT) { struct ieee80211_sta_ht_cap sta_ht_cap; memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); } - link->conf->eht_puncturing = 0; - rcu_read_lock(); bss_ies = rcu_dereference(cbss->beacon_ies); if (bss_ies) { @@ -7663,7 +8116,6 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, } if (bss_ies) { - const struct ieee80211_eht_operation *eht_oper; const struct element *elem; elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, @@ -7680,32 +8132,6 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, link->conf->ema_ap = true; else link->conf->ema_ap = false; - - elem = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_OPERATION, - bss_ies->data, bss_ies->len); - eht_oper = (const void *)(elem->data + 1); - - if (elem && - ieee80211_eht_oper_size_ok((const void *)(elem->data + 1), - elem->datalen - 1) && - (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) && - (eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) { - const struct ieee80211_eht_operation_info *info = - (void *)eht_oper->optional; - const u8 *disable_subchannel_bitmap = info->optional; - u16 bitmap; - - bitmap = get_unaligned_le16(disable_subchannel_bitmap); - if (cfg80211_valid_disable_subchannel_bitmap(&bitmap, - &link->conf->chandef) && - !(bitmap && ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING))) - ieee80211_handle_puncturing_bitmap(link, - eht_oper, - bitmap, - NULL); - else - conn_flags |= IEEE80211_CONN_DISABLE_EHT; - } } rcu_read_unlock(); @@ -7732,8 +8158,67 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, } else { link->smps_mode = link->u.mgd.req_smps; } +} + +static int +ieee80211_mgd_get_ap_ht_vht_capa(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgd_assoc_data *assoc_data, + int link_id) +{ + struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; + enum nl80211_band band = cbss->channel->band; + struct ieee80211_supported_band *sband; + const struct element *elem; + int err; + + /* neither HT nor VHT elements used on 6 GHz */ + if (band == NL80211_BAND_6GHZ) + return 0; + + if (assoc_data->link[link_id].conn.mode < IEEE80211_CONN_MODE_HT) + return 0; + + rcu_read_lock(); + elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_OPERATION); + if (!elem || elem->datalen < sizeof(struct ieee80211_ht_operation)) { + mlme_link_id_dbg(sdata, link_id, "no HT operation on BSS %pM\n", + cbss->bssid); + err = -EINVAL; + goto out_rcu; + } + assoc_data->link[link_id].ap_ht_param = + ((struct ieee80211_ht_operation *)(elem->data))->ht_param; + rcu_read_unlock(); - return conn_flags; + if (assoc_data->link[link_id].conn.mode < IEEE80211_CONN_MODE_VHT) + return 0; + + /* some drivers want to support VHT on 2.4 GHz even */ + sband = sdata->local->hw.wiphy->bands[band]; + if (!sband->vht_cap.vht_supported) + return 0; + + rcu_read_lock(); + elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); + /* but even then accept it not being present on the AP */ + if (!elem && band == NL80211_BAND_2GHZ) { + err = 0; + goto out_rcu; + } + if (!elem || elem->datalen < sizeof(struct ieee80211_vht_cap)) { + mlme_link_id_dbg(sdata, link_id, "no VHT capa on BSS %pM\n", + cbss->bssid); + err = -EINVAL; + goto out_rcu; + } + memcpy(&assoc_data->link[link_id].ap_vht_cap, elem->data, + sizeof(struct ieee80211_vht_cap)); + rcu_read_unlock(); + + return 0; +out_rcu: + rcu_read_unlock(); + return err; } int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, @@ -7745,11 +8230,10 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data; const struct element *ssid_elem; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; - ieee80211_conn_flags_t conn_flags = 0; struct ieee80211_link_data *link; struct cfg80211_bss *cbss; - struct ieee80211_bss *bss; - bool override; + bool override, uapsd_supported; + bool match_auth; int i, err; size_t size = sizeof(*assoc_data) + req->ie_len; @@ -7768,44 +8252,26 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ieee80211_mgd_csa_in_process(sdata, cbss)) { sdata_info(sdata, "AP is in CSA process, reject assoc\n"); - kfree(assoc_data); - return -EINVAL; + err = -EINVAL; + goto err_free; } rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); if (!ssid_elem || ssid_elem->datalen > sizeof(assoc_data->ssid)) { rcu_read_unlock(); - kfree(assoc_data); - return -EINVAL; + err = -EINVAL; + goto err_free; } memcpy(assoc_data->ssid, ssid_elem->data, ssid_elem->datalen); assoc_data->ssid_len = ssid_elem->datalen; - memcpy(vif_cfg->ssid, assoc_data->ssid, assoc_data->ssid_len); - vif_cfg->ssid_len = assoc_data->ssid_len; rcu_read_unlock(); - if (req->ap_mld_addr) { - for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { - if (!req->links[i].bss) - continue; - link = sdata_dereference(sdata->link[i], sdata); - if (link) - ether_addr_copy(assoc_data->link[i].addr, - link->conf->addr); - else - eth_random_addr(assoc_data->link[i].addr); - } - } else { - memcpy(assoc_data->link[0].addr, sdata->vif.addr, ETH_ALEN); - } - - assoc_data->s1g = cbss->channel->band == NL80211_BAND_S1GHZ; - - memcpy(assoc_data->ap_addr, - req->ap_mld_addr ?: req->bss->bssid, - ETH_ALEN); + if (req->ap_mld_addr) + memcpy(assoc_data->ap_addr, req->ap_mld_addr, ETH_ALEN); + else + memcpy(assoc_data->ap_addr, cbss->bssid, ETH_ALEN); if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; @@ -7823,98 +8289,148 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, false); } - if (ifmgd->auth_data && !ifmgd->auth_data->done) { - err = -EBUSY; - goto err_free; - } + memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa)); + memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask, + sizeof(ifmgd->ht_capa_mask)); - if (ifmgd->assoc_data) { - err = -EBUSY; - goto err_free; - } + memcpy(&ifmgd->vht_capa, &req->vht_capa, sizeof(ifmgd->vht_capa)); + memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask, + sizeof(ifmgd->vht_capa_mask)); - if (ifmgd->auth_data) { - bool match; + memcpy(&ifmgd->s1g_capa, &req->s1g_capa, sizeof(ifmgd->s1g_capa)); + memcpy(&ifmgd->s1g_capa_mask, &req->s1g_capa_mask, + sizeof(ifmgd->s1g_capa_mask)); - /* keep sta info, bssid if matching */ - match = ether_addr_equal(ifmgd->auth_data->ap_addr, - assoc_data->ap_addr) && - ifmgd->auth_data->link_id == req->link_id; + /* keep some setup (AP STA, channel, ...) if matching */ + match_auth = ifmgd->auth_data && + ether_addr_equal(ifmgd->auth_data->ap_addr, + assoc_data->ap_addr) && + ifmgd->auth_data->link_id == req->link_id; - /* Cleanup is delayed if auth_data matches */ - if (!match) - ieee80211_destroy_auth_data(sdata, false); - } + if (req->ap_mld_addr) { + uapsd_supported = true; - /* prepare assoc data */ + if (req->flags & (ASSOC_REQ_DISABLE_HT | + ASSOC_REQ_DISABLE_VHT | + ASSOC_REQ_DISABLE_HE | + ASSOC_REQ_DISABLE_EHT)) { + err = -EINVAL; + goto err_free; + } - bss = (void *)cbss->priv; - assoc_data->wmm = bss->wmm_used && - (local->hw.queues >= IEEE80211_NUM_ACS); + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { + struct ieee80211_supported_band *sband; + struct cfg80211_bss *link_cbss = req->links[i].bss; + struct ieee80211_bss *bss; - /* - * IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode. - * We still associate in non-HT mode (11a/b/g) if any one of these - * ciphers is configured as pairwise. - * We can set this to true for non-11n hardware, that'll be checked - * separately along with the peer capabilities. - */ - for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) { - if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || - req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || - req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { - conn_flags |= IEEE80211_CONN_DISABLE_HT; - conn_flags |= IEEE80211_CONN_DISABLE_VHT; - conn_flags |= IEEE80211_CONN_DISABLE_HE; - conn_flags |= IEEE80211_CONN_DISABLE_EHT; - netdev_info(sdata->dev, - "disabling HT/VHT/HE due to WEP/TKIP use\n"); + if (!link_cbss) + continue; + + bss = (void *)link_cbss->priv; + + if (!bss->wmm_used) { + err = -EINVAL; + req->links[i].error = err; + goto err_free; + } + + if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { + err = -EINVAL; + req->links[i].error = err; + goto err_free; + } + + link = sdata_dereference(sdata->link[i], sdata); + if (link) + ether_addr_copy(assoc_data->link[i].addr, + link->conf->addr); + else + eth_random_addr(assoc_data->link[i].addr); + sband = local->hw.wiphy->bands[link_cbss->channel->band]; + + if (match_auth && i == assoc_link_id && link) + assoc_data->link[i].conn = link->u.mgd.conn; + else + assoc_data->link[i].conn = + ieee80211_conn_settings_unlimited; + ieee80211_determine_our_sta_mode_assoc(sdata, sband, + req, true, i, + &assoc_data->link[i].conn); + assoc_data->link[i].bss = link_cbss; + assoc_data->link[i].disabled = req->links[i].disabled; + + if (!bss->uapsd_supported) + uapsd_supported = false; + + if (assoc_data->link[i].conn.mode < IEEE80211_CONN_MODE_EHT) { + err = -EINVAL; + req->links[i].error = err; + goto err_free; + } + + err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, + assoc_data, i); + if (err) { + err = -EINVAL; + req->links[i].error = err; + goto err_free; + } } - } - /* also disable HT/VHT/HE/EHT if the AP doesn't use WMM */ - if (!bss->wmm_used) { - conn_flags |= IEEE80211_CONN_DISABLE_HT; - conn_flags |= IEEE80211_CONN_DISABLE_VHT; - conn_flags |= IEEE80211_CONN_DISABLE_HE; - conn_flags |= IEEE80211_CONN_DISABLE_EHT; - netdev_info(sdata->dev, - "disabling HT/VHT/HE as WMM/QoS is not supported by the AP\n"); - } + assoc_data->wmm = true; + } else { + struct ieee80211_supported_band *sband; + struct ieee80211_bss *bss = (void *)cbss->priv; - if (req->flags & ASSOC_REQ_DISABLE_HT) { - mlme_dbg(sdata, "HT disabled by flag, disabling HT/VHT/HE\n"); - conn_flags |= IEEE80211_CONN_DISABLE_HT; - conn_flags |= IEEE80211_CONN_DISABLE_VHT; - conn_flags |= IEEE80211_CONN_DISABLE_HE; - conn_flags |= IEEE80211_CONN_DISABLE_EHT; - } + memcpy(assoc_data->link[0].addr, sdata->vif.addr, ETH_ALEN); + assoc_data->s1g = cbss->channel->band == NL80211_BAND_S1GHZ; - if (req->flags & ASSOC_REQ_DISABLE_VHT) { - mlme_dbg(sdata, "VHT disabled by flag, disabling VHT\n"); - conn_flags |= IEEE80211_CONN_DISABLE_VHT; - } + assoc_data->wmm = bss->wmm_used && + (local->hw.queues >= IEEE80211_NUM_ACS); + + if (cbss->channel->band == NL80211_BAND_6GHZ && + req->flags & (ASSOC_REQ_DISABLE_HT | + ASSOC_REQ_DISABLE_VHT | + ASSOC_REQ_DISABLE_HE)) { + err = -EINVAL; + goto err_free; + } - if (req->flags & ASSOC_REQ_DISABLE_HE) { - mlme_dbg(sdata, "HE disabled by flag, disabling HE/EHT\n"); - conn_flags |= IEEE80211_CONN_DISABLE_HE; - conn_flags |= IEEE80211_CONN_DISABLE_EHT; + sband = local->hw.wiphy->bands[cbss->channel->band]; + + assoc_data->link[0].bss = cbss; + + if (match_auth) + assoc_data->link[0].conn = sdata->deflink.u.mgd.conn; + else + assoc_data->link[0].conn = + ieee80211_conn_settings_unlimited; + ieee80211_determine_our_sta_mode_assoc(sdata, sband, req, + assoc_data->wmm, 0, + &assoc_data->link[0].conn); + + uapsd_supported = bss->uapsd_supported; + + err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, assoc_data, 0); + if (err) + goto err_free; } - if (req->flags & ASSOC_REQ_DISABLE_EHT) - conn_flags |= IEEE80211_CONN_DISABLE_EHT; + assoc_data->spp_amsdu = req->flags & ASSOC_REQ_SPP_AMSDU; - memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa)); - memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask, - sizeof(ifmgd->ht_capa_mask)); + if (ifmgd->auth_data && !ifmgd->auth_data->done) { + err = -EBUSY; + goto err_free; + } - memcpy(&ifmgd->vht_capa, &req->vht_capa, sizeof(ifmgd->vht_capa)); - memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask, - sizeof(ifmgd->vht_capa_mask)); + if (ifmgd->assoc_data) { + err = -EBUSY; + goto err_free; + } - memcpy(&ifmgd->s1g_capa, &req->s1g_capa, sizeof(ifmgd->s1g_capa)); - memcpy(&ifmgd->s1g_capa_mask, &req->s1g_capa_mask, - sizeof(ifmgd->s1g_capa_mask)); + /* Cleanup is delayed if auth_data matches */ + if (ifmgd->auth_data && !match_auth) + ieee80211_destroy_auth_data(sdata, false); if (req->ie && req->ie_len) { memcpy(assoc_data->ie, req->ie, req->ie_len); @@ -7946,19 +8462,10 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->assoc_link_id = assoc_link_id; if (req->ap_mld_addr) { - for (i = 0; i < ARRAY_SIZE(assoc_data->link); i++) { - assoc_data->link[i].conn_flags = conn_flags; - assoc_data->link[i].bss = req->links[i].bss; - assoc_data->link[i].disabled = req->links[i].disabled; - } - /* if there was no authentication, set up the link */ err = ieee80211_vif_set_links(sdata, BIT(assoc_link_id), 0); if (err) goto err_clear; - } else { - assoc_data->link[0].conn_flags = conn_flags; - assoc_data->link[0].bss = cbss; } link = sdata_dereference(sdata->link[assoc_link_id], sdata); @@ -7967,19 +8474,21 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, goto err_clear; } - /* keep old conn_flags from ieee80211_prep_channel() from auth */ - conn_flags |= link->u.mgd.conn_flags; - conn_flags |= ieee80211_setup_assoc_link(sdata, assoc_data, req, - conn_flags, assoc_link_id); - override = link->u.mgd.conn_flags != conn_flags; - link->u.mgd.conn_flags |= conn_flags; + override = link->u.mgd.conn.mode != + assoc_data->link[assoc_link_id].conn.mode || + link->u.mgd.conn.bw_limit != + assoc_data->link[assoc_link_id].conn.bw_limit; + link->u.mgd.conn = assoc_data->link[assoc_link_id].conn; + + ieee80211_setup_assoc_link(sdata, assoc_data, req, &link->u.mgd.conn, + assoc_link_id); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK), "U-APSD not supported with HW_PS_NULLFUNC_STACK\n")) sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD; - if (bss->wmm_used && bss->uapsd_supported && + if (assoc_data->wmm && uapsd_supported && (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD)) { assoc_data->uapsd = true; ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; @@ -8023,27 +8532,29 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, continue; if (i == assoc_data->assoc_link_id) continue; - /* only calculate the flags, hence link == NULL */ - err = ieee80211_prep_channel(sdata, NULL, + /* only calculate the mode, hence link == NULL */ + err = ieee80211_prep_channel(sdata, NULL, i, assoc_data->link[i].bss, true, - &assoc_data->link[i].conn_flags); + &assoc_data->link[i].conn); if (err) { req->links[i].error = err; goto err_clear; } } + memcpy(vif_cfg->ssid, assoc_data->ssid, assoc_data->ssid_len); + vif_cfg->ssid_len = assoc_data->ssid_len; + /* needed for transmitting the assoc frames properly */ memcpy(sdata->vif.cfg.ap_addr, assoc_data->ap_addr, ETH_ALEN); err = ieee80211_prep_connection(sdata, cbss, req->link_id, - req->ap_mld_addr, true, override); + req->ap_mld_addr, true, + &assoc_data->link[assoc_link_id].conn, + override); if (err) goto err_clear; - assoc_data->link[assoc_data->assoc_link_id].conn_flags = - link->u.mgd.conn_flags; - if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC)) { const struct cfg80211_bss_ies *beacon_ies; @@ -8204,9 +8715,9 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) &ifmgd->csa_connection_drop_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->tdls_peer_del_work); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, - &ifmgd->ml_reconf_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &ifmgd->neg_ttlm_timeout_work); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 449af4e1cc..9ef14e475c 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -168,6 +168,7 @@ void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata) int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, struct ocb_setup *setup) { + struct ieee80211_chan_req chanreq = { .oper = setup->chandef }; struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; u64 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID; @@ -182,7 +183,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; sdata->deflink.needed_rx_chains = sdata->local->rx_chains; - err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef, + err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; @@ -207,7 +208,7 @@ int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata) lockdep_assert_wiphy(sdata->local->hw.wiphy); ifocb->joined = false; - sta_info_flush(sdata); + sta_info_flush(sdata, -1); spin_lock_bh(&ifocb->incomplete_lock); while (!list_empty(&ifocb->incomplete_stations)) { diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 6c40802025..65e1e9e971 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -86,7 +86,7 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) lockdep_assert_wiphy(local->hw.wiphy); - if (WARN_ON(local->use_chanctx)) + if (WARN_ON(!local->emulate_chanctx)) return; /* @@ -136,7 +136,7 @@ void ieee80211_offchannel_return(struct ieee80211_local *local) lockdep_assert_wiphy(local->hw.wiphy); - if (WARN_ON(local->use_chanctx)) + if (WARN_ON(!local->emulate_chanctx)) return; list_for_each_entry(sdata, &local->interfaces, list) { @@ -351,10 +351,13 @@ static void _ieee80211_start_next_roc(struct ieee80211_local *local) * 20 MHz channel width) don't stop all the operations but still * treat it as though the ROC operation started properly, so * other ROC operations won't interfere with this one. + * + * Note: scan can't run, tmp_channel is what we use, so this + * must be the currently active channel. */ - roc->on_channel = roc->chan == local->_oper_chandef.chan && - local->_oper_chandef.width != NL80211_CHAN_WIDTH_5 && - local->_oper_chandef.width != NL80211_CHAN_WIDTH_10; + roc->on_channel = roc->chan == local->hw.conf.chandef.chan && + local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_5 && + local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_10; /* start this ROC */ ieee80211_recalc_idle(local); @@ -363,7 +366,7 @@ static void _ieee80211_start_next_roc(struct ieee80211_local *local) ieee80211_offchannel_stop_vifs(local); local->tmp_channel = roc->chan; - ieee80211_hw_config(local, 0); + ieee80211_hw_conf_chan(local); } wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, @@ -426,7 +429,7 @@ static void __ieee80211_roc_work(struct ieee80211_local *local) return; if (!roc->started) { - WARN_ON(local->use_chanctx); + WARN_ON(!local->emulate_chanctx); _ieee80211_start_next_roc(local); } else { on_channel = roc->on_channel; @@ -439,7 +442,7 @@ static void __ieee80211_roc_work(struct ieee80211_local *local) ieee80211_flush_queues(local, NULL, false); local->tmp_channel = NULL; - ieee80211_hw_config(local, 0); + ieee80211_hw_conf_chan(local); ieee80211_offchannel_return(local); } @@ -539,7 +542,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, /* this may work, but is untested */ return -EOPNOTSUPP; - if (local->use_chanctx && !local->ops->remain_on_channel) + if (!local->emulate_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; roc = kzalloc(sizeof(*roc), GFP_KERNEL); @@ -894,8 +897,18 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, break; } - if (ether_addr_equal(conf->addr, mgmt->sa)) + if (ether_addr_equal(conf->addr, mgmt->sa)) { + /* If userspace requested Tx on a specific link + * use the same link id if the link bss is matching + * the requested chan. + */ + if (sdata->vif.valid_links && + params->link_id >= 0 && params->link_id == i && + params->chan == chanctx_conf->def.chan) + link_id = i; + break; + } chanctx_conf = NULL; } diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c new file mode 100644 index 0000000000..055a60e909 --- /dev/null +++ b/net/mac80211/parse.c @@ -0,0 +1,971 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2002-2005, Instant802 Networks, Inc. + * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> + * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> + * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018-2024 Intel Corporation + * + * element parsing for mac80211 + */ + +#include <net/mac80211.h> +#include <linux/netdevice.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/skbuff.h> +#include <linux/etherdevice.h> +#include <linux/if_arp.h> +#include <linux/bitmap.h> +#include <linux/crc32.h> +#include <net/net_namespace.h> +#include <net/cfg80211.h> +#include <net/rtnetlink.h> +#include <kunit/visibility.h> + +#include "ieee80211_i.h" +#include "driver-ops.h" +#include "rate.h" +#include "mesh.h" +#include "wme.h" +#include "led.h" +#include "wep.h" + +struct ieee80211_elems_parse { + /* must be first for kfree to work */ + struct ieee802_11_elems elems; + + /* The basic Multi-Link element in the original elements */ + const struct element *ml_basic_elem; + + /* The reconfiguration Multi-Link element in the original elements */ + const struct element *ml_reconf_elem; + + /* + * scratch buffer that can be used for various element parsing related + * tasks, e.g., element de-fragmentation etc. + */ + size_t scratch_len; + u8 *scratch_pos; + u8 scratch[] __counted_by(scratch_len); +}; + +static void +ieee80211_parse_extension_element(u32 *crc, + const struct element *elem, + struct ieee80211_elems_parse *elems_parse, + struct ieee80211_elems_parse_params *params) +{ + struct ieee802_11_elems *elems = &elems_parse->elems; + const void *data = elem->data + 1; + bool calc_crc = false; + u8 len; + + if (!elem->datalen) + return; + + len = elem->datalen - 1; + + switch (elem->data[0]) { + case WLAN_EID_EXT_HE_MU_EDCA: + if (params->mode < IEEE80211_CONN_MODE_HE) + break; + calc_crc = true; + if (len >= sizeof(*elems->mu_edca_param_set)) + elems->mu_edca_param_set = data; + break; + case WLAN_EID_EXT_HE_CAPABILITY: + if (params->mode < IEEE80211_CONN_MODE_HE) + break; + if (ieee80211_he_capa_size_ok(data, len)) { + elems->he_cap = data; + elems->he_cap_len = len; + } + break; + case WLAN_EID_EXT_HE_OPERATION: + if (params->mode < IEEE80211_CONN_MODE_HE) + break; + calc_crc = true; + if (len >= sizeof(*elems->he_operation) && + len >= ieee80211_he_oper_size(data) - 1) + elems->he_operation = data; + break; + case WLAN_EID_EXT_UORA: + if (params->mode < IEEE80211_CONN_MODE_HE) + break; + if (len >= 1) + elems->uora_element = data; + break; + case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME: + if (len == 3) + elems->max_channel_switch_time = data; + break; + case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION: + if (len >= sizeof(*elems->mbssid_config_ie)) + elems->mbssid_config_ie = data; + break; + case WLAN_EID_EXT_HE_SPR: + if (params->mode < IEEE80211_CONN_MODE_HE) + break; + if (len >= sizeof(*elems->he_spr) && + len >= ieee80211_he_spr_size(data) - 1) + elems->he_spr = data; + break; + case WLAN_EID_EXT_HE_6GHZ_CAPA: + if (params->mode < IEEE80211_CONN_MODE_HE) + break; + if (len >= sizeof(*elems->he_6ghz_capa)) + elems->he_6ghz_capa = data; + break; + case WLAN_EID_EXT_EHT_CAPABILITY: + if (params->mode < IEEE80211_CONN_MODE_EHT) + break; + if (ieee80211_eht_capa_size_ok(elems->he_cap, + data, len, + params->from_ap)) { + elems->eht_cap = data; + elems->eht_cap_len = len; + } + break; + case WLAN_EID_EXT_EHT_OPERATION: + if (params->mode < IEEE80211_CONN_MODE_EHT) + break; + if (ieee80211_eht_oper_size_ok(data, len)) + elems->eht_operation = data; + calc_crc = true; + break; + case WLAN_EID_EXT_EHT_MULTI_LINK: + if (params->mode < IEEE80211_CONN_MODE_EHT) + break; + calc_crc = true; + + if (ieee80211_mle_size_ok(data, len)) { + const struct ieee80211_multi_link_elem *mle = + (void *)data; + + switch (le16_get_bits(mle->control, + IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + if (elems_parse->ml_basic_elem) { + elems->parse_error |= + IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC; + break; + } + elems_parse->ml_basic_elem = elem; + break; + case IEEE80211_ML_CONTROL_TYPE_RECONF: + elems_parse->ml_reconf_elem = elem; + break; + default: + break; + } + } + break; + case WLAN_EID_EXT_BANDWIDTH_INDICATION: + if (params->mode < IEEE80211_CONN_MODE_EHT) + break; + if (ieee80211_bandwidth_indication_size_ok(data, len)) + elems->bandwidth_indication = data; + calc_crc = true; + break; + case WLAN_EID_EXT_TID_TO_LINK_MAPPING: + if (params->mode < IEEE80211_CONN_MODE_EHT) + break; + calc_crc = true; + if (ieee80211_tid_to_link_map_size_ok(data, len) && + elems->ttlm_num < ARRAY_SIZE(elems->ttlm)) { + elems->ttlm[elems->ttlm_num] = (void *)data; + elems->ttlm_num++; + } + break; + } + + if (crc && calc_crc) + *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); +} + +static u32 +_ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, + struct ieee80211_elems_parse *elems_parse, + const struct element *check_inherit) +{ + struct ieee802_11_elems *elems = &elems_parse->elems; + const struct element *elem; + bool calc_crc = params->filter != 0; + DECLARE_BITMAP(seen_elems, 256); + u32 crc = params->crc; + + bitmap_zero(seen_elems, 256); + + for_each_element(elem, params->start, params->len) { + const struct element *subelem; + u8 elem_parse_failed; + u8 id = elem->id; + u8 elen = elem->datalen; + const u8 *pos = elem->data; + + if (check_inherit && + !cfg80211_is_element_inherited(elem, + check_inherit)) + continue; + + switch (id) { + case WLAN_EID_SSID: + case WLAN_EID_SUPP_RATES: + case WLAN_EID_FH_PARAMS: + case WLAN_EID_DS_PARAMS: + case WLAN_EID_CF_PARAMS: + case WLAN_EID_TIM: + case WLAN_EID_IBSS_PARAMS: + case WLAN_EID_CHALLENGE: + case WLAN_EID_RSN: + case WLAN_EID_ERP_INFO: + case WLAN_EID_EXT_SUPP_RATES: + case WLAN_EID_HT_CAPABILITY: + case WLAN_EID_HT_OPERATION: + case WLAN_EID_VHT_CAPABILITY: + case WLAN_EID_VHT_OPERATION: + case WLAN_EID_MESH_ID: + case WLAN_EID_MESH_CONFIG: + case WLAN_EID_PEER_MGMT: + case WLAN_EID_PREQ: + case WLAN_EID_PREP: + case WLAN_EID_PERR: + case WLAN_EID_RANN: + case WLAN_EID_CHANNEL_SWITCH: + case WLAN_EID_EXT_CHANSWITCH_ANN: + case WLAN_EID_COUNTRY: + case WLAN_EID_PWR_CONSTRAINT: + case WLAN_EID_TIMEOUT_INTERVAL: + case WLAN_EID_SECONDARY_CHANNEL_OFFSET: + case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: + case WLAN_EID_CHAN_SWITCH_PARAM: + case WLAN_EID_EXT_CAPABILITY: + case WLAN_EID_CHAN_SWITCH_TIMING: + case WLAN_EID_LINK_ID: + case WLAN_EID_BSS_MAX_IDLE_PERIOD: + case WLAN_EID_RSNX: + case WLAN_EID_S1G_BCN_COMPAT: + case WLAN_EID_S1G_CAPABILITIES: + case WLAN_EID_S1G_OPERATION: + case WLAN_EID_AID_RESPONSE: + case WLAN_EID_S1G_SHORT_BCN_INTERVAL: + /* + * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible + * that if the content gets bigger it might be needed more than once + */ + if (test_bit(id, seen_elems)) { + elems->parse_error |= + IEEE80211_PARSE_ERR_DUP_ELEM; + continue; + } + break; + } + + if (calc_crc && id < 64 && (params->filter & (1ULL << id))) + crc = crc32_be(crc, pos - 2, elen + 2); + + elem_parse_failed = 0; + + switch (id) { + case WLAN_EID_LINK_ID: + if (elen + 2 < sizeof(struct ieee80211_tdls_lnkie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->lnk_id = (void *)(pos - 2); + break; + case WLAN_EID_CHAN_SWITCH_TIMING: + if (elen < sizeof(struct ieee80211_ch_switch_timing)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->ch_sw_timing = (void *)pos; + break; + case WLAN_EID_EXT_CAPABILITY: + elems->ext_capab = pos; + elems->ext_capab_len = elen; + break; + case WLAN_EID_SSID: + elems->ssid = pos; + elems->ssid_len = elen; + break; + case WLAN_EID_SUPP_RATES: + elems->supp_rates = pos; + elems->supp_rates_len = elen; + break; + case WLAN_EID_DS_PARAMS: + if (elen >= 1) + elems->ds_params = pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_TIM: + if (elen >= sizeof(struct ieee80211_tim_ie)) { + elems->tim = (void *)pos; + elems->tim_len = elen; + } else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_VENDOR_SPECIFIC: + if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && + pos[2] == 0xf2) { + /* Microsoft OUI (00:50:F2) */ + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + if (elen >= 5 && pos[3] == 2) { + /* OUI Type 2 - WMM IE */ + if (pos[4] == 0) { + elems->wmm_info = pos; + elems->wmm_info_len = elen; + } else if (pos[4] == 1) { + elems->wmm_param = pos; + elems->wmm_param_len = elen; + } + } + } + break; + case WLAN_EID_RSN: + elems->rsn = pos; + elems->rsn_len = elen; + break; + case WLAN_EID_ERP_INFO: + if (elen >= 1) + elems->erp_info = pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_EXT_SUPP_RATES: + elems->ext_supp_rates = pos; + elems->ext_supp_rates_len = elen; + break; + case WLAN_EID_HT_CAPABILITY: + if (params->mode < IEEE80211_CONN_MODE_HT) + break; + if (elen >= sizeof(struct ieee80211_ht_cap)) + elems->ht_cap_elem = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_HT_OPERATION: + if (params->mode < IEEE80211_CONN_MODE_HT) + break; + if (elen >= sizeof(struct ieee80211_ht_operation)) + elems->ht_operation = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_VHT_CAPABILITY: + if (params->mode < IEEE80211_CONN_MODE_VHT) + break; + if (elen >= sizeof(struct ieee80211_vht_cap)) + elems->vht_cap_elem = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_VHT_OPERATION: + if (params->mode < IEEE80211_CONN_MODE_VHT) + break; + if (elen >= sizeof(struct ieee80211_vht_operation)) { + elems->vht_operation = (void *)pos; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_OPMODE_NOTIF: + if (params->mode < IEEE80211_CONN_MODE_VHT) + break; + if (elen > 0) { + elems->opmode_notif = pos; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_MESH_ID: + elems->mesh_id = pos; + elems->mesh_id_len = elen; + break; + case WLAN_EID_MESH_CONFIG: + if (elen >= sizeof(struct ieee80211_meshconf_ie)) + elems->mesh_config = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_PEER_MGMT: + elems->peering = pos; + elems->peering_len = elen; + break; + case WLAN_EID_MESH_AWAKE_WINDOW: + if (elen >= 2) + elems->awake_window = (void *)pos; + break; + case WLAN_EID_PREQ: + elems->preq = pos; + elems->preq_len = elen; + break; + case WLAN_EID_PREP: + elems->prep = pos; + elems->prep_len = elen; + break; + case WLAN_EID_PERR: + elems->perr = pos; + elems->perr_len = elen; + break; + case WLAN_EID_RANN: + if (elen >= sizeof(struct ieee80211_rann_ie)) + elems->rann = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_CHANNEL_SWITCH: + if (elen != sizeof(struct ieee80211_channel_sw_ie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->ch_switch_ie = (void *)pos; + break; + case WLAN_EID_EXT_CHANSWITCH_ANN: + if (elen != sizeof(struct ieee80211_ext_chansw_ie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->ext_chansw_ie = (void *)pos; + break; + case WLAN_EID_SECONDARY_CHANNEL_OFFSET: + if (params->mode < IEEE80211_CONN_MODE_HT) + break; + if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->sec_chan_offs = (void *)pos; + break; + case WLAN_EID_CHAN_SWITCH_PARAM: + if (elen < + sizeof(*elems->mesh_chansw_params_ie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->mesh_chansw_params_ie = (void *)pos; + break; + case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: + if (params->mode < IEEE80211_CONN_MODE_VHT) + break; + + if (!params->action) { + elem_parse_failed = + IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; + break; + } + + if (elen < sizeof(*elems->wide_bw_chansw_ie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->wide_bw_chansw_ie = (void *)pos; + break; + case WLAN_EID_CHANNEL_SWITCH_WRAPPER: + if (params->mode < IEEE80211_CONN_MODE_VHT) + break; + if (params->action) { + elem_parse_failed = + IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; + break; + } + /* + * This is a bit tricky, but as we only care about + * a few elements, parse them out manually. + */ + subelem = cfg80211_find_elem(WLAN_EID_WIDE_BW_CHANNEL_SWITCH, + pos, elen); + if (subelem) { + if (subelem->datalen >= sizeof(*elems->wide_bw_chansw_ie)) + elems->wide_bw_chansw_ie = + (void *)subelem->data; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + } + + if (params->mode < IEEE80211_CONN_MODE_EHT) + break; + + subelem = cfg80211_find_ext_elem(WLAN_EID_EXT_BANDWIDTH_INDICATION, + pos, elen); + if (subelem) { + const void *edata = subelem->data + 1; + u8 edatalen = subelem->datalen - 1; + + if (ieee80211_bandwidth_indication_size_ok(edata, + edatalen)) + elems->bandwidth_indication = edata; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + } + break; + case WLAN_EID_COUNTRY: + elems->country_elem = pos; + elems->country_elem_len = elen; + break; + case WLAN_EID_PWR_CONSTRAINT: + if (elen != 1) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->pwr_constr_elem = pos; + break; + case WLAN_EID_CISCO_VENDOR_SPECIFIC: + /* Lots of different options exist, but we only care + * about the Dynamic Transmit Power Control element. + * First check for the Cisco OUI, then for the DTPC + * tag (0x00). + */ + if (elen < 4) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + + if (pos[0] != 0x00 || pos[1] != 0x40 || + pos[2] != 0x96 || pos[3] != 0x00) + break; + + if (elen != 6) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + + elems->cisco_dtpc_elem = pos; + break; + case WLAN_EID_ADDBA_EXT: + if (elen < sizeof(struct ieee80211_addba_ext_ie)) { + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + } + elems->addba_ext_ie = (void *)pos; + break; + case WLAN_EID_TIMEOUT_INTERVAL: + if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) + elems->timeout_int = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_BSS_MAX_IDLE_PERIOD: + if (elen >= sizeof(*elems->max_idle_period_ie)) + elems->max_idle_period_ie = (void *)pos; + break; + case WLAN_EID_RSNX: + elems->rsnx = pos; + elems->rsnx_len = elen; + break; + case WLAN_EID_TX_POWER_ENVELOPE: + if (elen < 1 || + elen > sizeof(struct ieee80211_tx_pwr_env)) + break; + + if (elems->tx_pwr_env_num >= ARRAY_SIZE(elems->tx_pwr_env)) + break; + + elems->tx_pwr_env[elems->tx_pwr_env_num] = (void *)pos; + elems->tx_pwr_env_len[elems->tx_pwr_env_num] = elen; + elems->tx_pwr_env_num++; + break; + case WLAN_EID_EXTENSION: + ieee80211_parse_extension_element(calc_crc ? + &crc : NULL, + elem, elems_parse, + params); + break; + case WLAN_EID_S1G_CAPABILITIES: + if (params->mode != IEEE80211_CONN_MODE_S1G) + break; + if (elen >= sizeof(*elems->s1g_capab)) + elems->s1g_capab = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_S1G_OPERATION: + if (params->mode != IEEE80211_CONN_MODE_S1G) + break; + if (elen == sizeof(*elems->s1g_oper)) + elems->s1g_oper = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_S1G_BCN_COMPAT: + if (params->mode != IEEE80211_CONN_MODE_S1G) + break; + if (elen == sizeof(*elems->s1g_bcn_compat)) + elems->s1g_bcn_compat = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + case WLAN_EID_AID_RESPONSE: + if (params->mode != IEEE80211_CONN_MODE_S1G) + break; + if (elen == sizeof(struct ieee80211_aid_response_ie)) + elems->aid_resp = (void *)pos; + else + elem_parse_failed = + IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; + break; + default: + break; + } + + if (elem_parse_failed) + elems->parse_error |= elem_parse_failed; + else + __set_bit(id, seen_elems); + } + + if (!for_each_element_completed(elem, params->start, params->len)) + elems->parse_error |= IEEE80211_PARSE_ERR_INVALID_END; + + return crc; +} + +static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, + struct ieee802_11_elems *elems, + struct cfg80211_bss *bss, + u8 *nontransmitted_profile) +{ + const struct element *elem, *sub; + size_t profile_len = 0; + bool found = false; + + if (!bss || !bss->transmitted_bss) + return profile_len; + + for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { + if (elem->datalen < 2) + continue; + if (elem->data[0] < 1 || elem->data[0] > 8) + continue; + + for_each_element(sub, elem->data + 1, elem->datalen - 1) { + u8 new_bssid[ETH_ALEN]; + const u8 *index; + + if (sub->id != 0 || sub->datalen < 4) { + /* not a valid BSS profile */ + continue; + } + + if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || + sub->data[1] != 2) { + /* The first element of the + * Nontransmitted BSSID Profile is not + * the Nontransmitted BSSID Capability + * element. + */ + continue; + } + + memset(nontransmitted_profile, 0, len); + profile_len = cfg80211_merge_profile(start, len, + elem, + sub, + nontransmitted_profile, + len); + + /* found a Nontransmitted BSSID Profile */ + index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, + nontransmitted_profile, + profile_len); + if (!index || index[1] < 1 || index[2] == 0) { + /* Invalid MBSSID Index element */ + continue; + } + + cfg80211_gen_new_bssid(bss->transmitted_bss->bssid, + elem->data[0], + index[2], + new_bssid); + if (ether_addr_equal(new_bssid, bss->bssid)) { + found = true; + elems->bssid_index_len = index[1]; + elems->bssid_index = (void *)&index[2]; + break; + } + } + } + + return found ? profile_len : 0; +} + +static void +ieee80211_mle_get_sta_prof(struct ieee80211_elems_parse *elems_parse, + u8 link_id) +{ + struct ieee802_11_elems *elems = &elems_parse->elems; + const struct ieee80211_multi_link_elem *ml = elems->ml_basic; + ssize_t ml_len = elems->ml_basic_len; + const struct element *sub; + + for_each_mle_subelement(sub, (u8 *)ml, ml_len) { + struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; + ssize_t sta_prof_len; + u16 control; + + if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) + continue; + + if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, + sub->datalen)) + return; + + control = le16_to_cpu(prof->control); + + if (link_id != u16_get_bits(control, + IEEE80211_MLE_STA_CONTROL_LINK_ID)) + continue; + + if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) + return; + + /* the sub element can be fragmented */ + sta_prof_len = + cfg80211_defragment_element(sub, + (u8 *)ml, ml_len, + elems_parse->scratch_pos, + elems_parse->scratch + + elems_parse->scratch_len - + elems_parse->scratch_pos, + IEEE80211_MLE_SUBELEM_FRAGMENT); + + if (sta_prof_len < 0) + return; + + elems->prof = (void *)elems_parse->scratch_pos; + elems->sta_prof_len = sta_prof_len; + elems_parse->scratch_pos += sta_prof_len; + + return; + } +} + +static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, + struct ieee80211_elems_parse_params *params) +{ + struct ieee802_11_elems *elems = &elems_parse->elems; + struct ieee80211_mle_per_sta_profile *prof; + struct ieee80211_elems_parse_params sub = { + .mode = params->mode, + .action = params->action, + .from_ap = params->from_ap, + .link_id = -1, + }; + ssize_t ml_len = elems->ml_basic_len; + const struct element *non_inherit = NULL; + const u8 *end; + + ml_len = cfg80211_defragment_element(elems_parse->ml_basic_elem, + elems->ie_start, + elems->total_len, + elems_parse->scratch_pos, + elems_parse->scratch + + elems_parse->scratch_len - + elems_parse->scratch_pos, + WLAN_EID_FRAGMENT); + + if (ml_len < 0) + return; + + elems->ml_basic = (const void *)elems_parse->scratch_pos; + elems->ml_basic_len = ml_len; + elems_parse->scratch_pos += ml_len; + + if (params->link_id == -1) + return; + + ieee80211_mle_get_sta_prof(elems_parse, params->link_id); + prof = elems->prof; + + if (!prof) + return; + + /* check if we have the 4 bytes for the fixed part in assoc response */ + if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { + elems->prof = NULL; + elems->sta_prof_len = 0; + return; + } + + /* + * Skip the capability information and the status code that are expected + * as part of the station profile in association response frames. Note + * the -1 is because the 'sta_info_len' is accounted to as part of the + * per-STA profile, but not part of the 'u8 variable[]' portion. + */ + sub.start = prof->variable + prof->sta_info_len - 1 + 4; + end = (const u8 *)prof + elems->sta_prof_len; + sub.len = end - sub.start; + + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub.start, sub.len); + _ieee802_11_parse_elems_full(&sub, elems_parse, non_inherit); +} + +static void +ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse) +{ + struct ieee802_11_elems *elems = &elems_parse->elems; + ssize_t ml_len; + + ml_len = cfg80211_defragment_element(elems_parse->ml_reconf_elem, + elems->ie_start, + elems->total_len, + elems_parse->scratch_pos, + elems_parse->scratch + + elems_parse->scratch_len - + elems_parse->scratch_pos, + WLAN_EID_FRAGMENT); + if (ml_len < 0) + return; + elems->ml_reconf = (void *)elems_parse->scratch_pos; + elems->ml_reconf_len = ml_len; + elems_parse->scratch_pos += ml_len; +} + +struct ieee802_11_elems * +ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) +{ + struct ieee80211_elems_parse *elems_parse; + struct ieee802_11_elems *elems; + const struct element *non_inherit = NULL; + u8 *nontransmitted_profile; + int nontransmitted_profile_len = 0; + size_t scratch_len = 3 * params->len; + + BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0); + + elems_parse = kzalloc(struct_size(elems_parse, scratch, scratch_len), + GFP_ATOMIC); + if (!elems_parse) + return NULL; + + elems_parse->scratch_len = scratch_len; + elems_parse->scratch_pos = elems_parse->scratch; + + elems = &elems_parse->elems; + elems->ie_start = params->start; + elems->total_len = params->len; + + nontransmitted_profile = elems_parse->scratch_pos; + nontransmitted_profile_len = + ieee802_11_find_bssid_profile(params->start, params->len, + elems, params->bss, + nontransmitted_profile); + elems_parse->scratch_pos += nontransmitted_profile_len; + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + nontransmitted_profile, + nontransmitted_profile_len); + + elems->crc = _ieee802_11_parse_elems_full(params, elems_parse, + non_inherit); + + /* Override with nontransmitted profile, if found */ + if (nontransmitted_profile_len) { + struct ieee80211_elems_parse_params sub = { + .mode = params->mode, + .start = nontransmitted_profile, + .len = nontransmitted_profile_len, + .action = params->action, + .link_id = params->link_id, + }; + + _ieee802_11_parse_elems_full(&sub, elems_parse, NULL); + } + + ieee80211_mle_parse_link(elems_parse, params); + + ieee80211_mle_defrag_reconf(elems_parse); + + if (elems->tim && !elems->parse_error) { + const struct ieee80211_tim_ie *tim_ie = elems->tim; + + elems->dtim_period = tim_ie->dtim_period; + elems->dtim_count = tim_ie->dtim_count; + } + + /* Override DTIM period and count if needed */ + if (elems->bssid_index && + elems->bssid_index_len >= + offsetofend(struct ieee80211_bssid_index, dtim_period)) + elems->dtim_period = elems->bssid_index->dtim_period; + + if (elems->bssid_index && + elems->bssid_index_len >= + offsetofend(struct ieee80211_bssid_index, dtim_count)) + elems->dtim_count = elems->bssid_index->dtim_count; + + return elems; +} +EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full); + +int ieee80211_parse_bitrates(enum nl80211_chan_width width, + const struct ieee80211_supported_band *sband, + const u8 *srates, int srates_len, u32 *rates) +{ + u32 rate_flags = ieee80211_chanwidth_rate_flags(width); + struct ieee80211_rate *br; + int brate, rate, i, j, count = 0; + + *rates = 0; + + for (i = 0; i < srates_len; i++) { + rate = srates[i] & 0x7f; + + for (j = 0; j < sband->n_bitrates; j++) { + br = &sband->bitrates[j]; + if ((rate_flags & br->flags) != rate_flags) + continue; + + brate = DIV_ROUND_UP(br->bitrate, 5); + if (brate == rate) { + *rates |= BIT(j); + count++; + break; + } + } + } + return count; +} diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 0efdaa8f2a..4dc1def695 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -4,7 +4,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2019, 2022-2024 Intel Corporation */ #include <linux/kernel.h> @@ -279,10 +279,10 @@ void ieee80211_check_rate_mask(struct ieee80211_link_data *link) u32 user_mask, basic_rates = link->conf->basic_rates; enum nl80211_band band; - if (WARN_ON(!link->conf->chandef.chan)) + if (WARN_ON(!link->conf->chanreq.oper.chan)) return; - band = link->conf->chandef.chan->band; + band = link->conf->chanreq.oper.chan->band; if (band == NL80211_BAND_S1GHZ) { /* TODO */ return; @@ -762,7 +762,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, u32 i, flags; *mask = sdata->rc_rateidx_mask[sband->band]; - flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); + flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); for (i = 0; i < sband->n_bitrates; i++) { if ((flags & sband->bitrates[i].flags) != flags) *mask &= ~BIT(i); @@ -818,7 +818,7 @@ rate_control_apply_mask_ratetbl(struct sta_info *sta, mcs_mask, vht_mask)) return; - chan_width = sta->sdata->vif.bss_conf.chandef.width; + chan_width = sta->sdata->vif.bss_conf.chanreq.oper.width; for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) { if (rates->rate[i].idx < 0) break; @@ -855,7 +855,7 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, * included in the configured mask and change the rate indexes * if needed. */ - chan_width = sdata->vif.bss_conf.chandef.width; + chan_width = sdata->vif.bss_conf.chanreq.oper.width; for (i = 0; i < max_rates; i++) { /* Skip invalid rates */ if (rates[i].idx < 0) @@ -877,6 +877,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_supported_band *sband; + u32 mask = ~0; rate_control_fill_sta_table(sta, info, dest, max_rates); @@ -889,9 +890,12 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); + if (!(info->control.flags & IEEE80211_TX_CTRL_SCAN_TX)) + mask = sdata->rc_rateidx_mask[info->band]; + if (dest[0].idx < 0) __rate_control_send_low(&sdata->local->hw, sband, sta, info, - sdata->rc_rateidx_mask[info->band]); + mask); if (sta) rate_fixup_ratelist(vif, sband, info, dest, max_rates); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 541b0f53c6..4914692750 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -6,7 +6,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/jiffies.h> @@ -1251,8 +1251,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - u16 sc = le16_to_cpu(hdr->seq_ctrl); - u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; + u16 mpdu_seq_num = ieee80211_get_sn(hdr); u16 head_seq_num, buf_size; int index; bool ret = true; @@ -1435,13 +1434,31 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || - ieee80211_is_any_nullfunc(hdr->frame_control) || - is_multicast_ether_addr(hdr->addr1)) + ieee80211_is_any_nullfunc(hdr->frame_control)) return RX_CONTINUE; if (!rx->sta) return RX_CONTINUE; + if (unlikely(is_multicast_ether_addr(hdr->addr1))) { + struct ieee80211_sub_if_data *sdata = rx->sdata; + u16 sn = ieee80211_get_sn(hdr); + + if (!ieee80211_is_data_present(hdr->frame_control)) + return RX_CONTINUE; + + if (!ieee80211_vif_is_mld(&sdata->vif) || + sdata->vif.type != NL80211_IFTYPE_STATION) + return RX_CONTINUE; + + if (sdata->u.mgd.mcast_seq_last != IEEE80211_SN_MODULO && + ieee80211_sn_less_eq(sn, sdata->u.mgd.mcast_seq_last)) + return RX_DROP_U_DUP; + + sdata->u.mgd.mcast_seq_last = sn; + return RX_CONTINUE; + } + if (unlikely(ieee80211_has_retry(hdr->frame_control) && rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) { I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount); @@ -3351,7 +3368,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) if (ieee80211_hw_check(&rx->local->hw, DETECTS_COLOR_COLLISION)) return; - if (rx->sdata->vif.bss_conf.csa_active) + if (rx->link->conf->csa_active) return; baselen = mgmt->u.beacon.variable - rx->skb->data; @@ -3363,7 +3380,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) rx->skb->len - baselen); if (ie && ie->datalen >= sizeof(struct ieee80211_he_operation) && ie->datalen >= ieee80211_he_oper_size(ie->data + 1)) { - struct ieee80211_bss_conf *bss_conf = &rx->sdata->vif.bss_conf; + struct ieee80211_bss_conf *bss_conf = rx->link->conf; const struct ieee80211_he_operation *he_oper; u8 color; @@ -3377,7 +3394,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) if (color == bss_conf->he_bss_color.color) ieee80211_obss_color_collision_notify(&rx->sdata->vif, BIT_ULL(color), - GFP_ATOMIC); + bss_conf->link_id); } } @@ -3770,6 +3787,32 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; } break; + case WLAN_CATEGORY_PROTECTED_EHT: + if (len < offsetofend(typeof(*mgmt), + u.action.u.ttlm_req.action_code)) + break; + + switch (mgmt->u.action.u.ttlm_req.action_code) { + case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.ttlm_req)) + goto invalid; + goto queue; + case WLAN_PROTECTED_EHT_ACTION_TTLM_RES: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.ttlm_res)) + goto invalid; + goto queue; + default: + break; + } + break; } return RX_CONTINUE; @@ -3927,8 +3970,8 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, -1, status->band); } - dev_kfree_skb(rx->skb); - return RX_QUEUED; + + return RX_DROP_U_UNKNOWN_ACTION_REJECTED; } static ieee80211_rx_result debug_noinline @@ -5199,7 +5242,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, */ if (!status->link_valid && pubsta->mlo) { - struct ieee80211_hdr *hdr = (void *)skb->data; struct link_sta_info *link_sta; link_sta = link_sta_info_get_bss(rx.sdata, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f9d5842601..b5f2df61c7 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -257,7 +257,6 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); - struct ieee80211_sub_if_data *sdata1, *sdata2; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; @@ -281,12 +280,6 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) if (skb->len < min_hdr_len) return; - sdata1 = rcu_dereference(local->scan_sdata); - sdata2 = rcu_dereference(local->sched_scan_sdata); - - if (likely(!sdata1 && !sdata2)) - return; - if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but @@ -304,10 +297,17 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; if (ieee80211_is_probe_resp(mgmt->frame_control)) { + struct ieee80211_sub_if_data *sdata1, *sdata2; struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; + sdata1 = rcu_dereference(local->scan_sdata); + sdata2 = rcu_dereference(local->sched_scan_sdata); + + if (likely(!sdata1 && !sdata2)) + return; + scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); @@ -327,8 +327,16 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) sched_scan_req_flags, mgmt->da)) return; + } else { + /* Beacons are expected only with broadcast address */ + if (!is_broadcast_ether_addr(mgmt->da)) + return; } + /* Do not update the BSS table in case of only monitor interfaces */ + if (local->open_count == local->monitors) + return; + bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); @@ -350,7 +358,8 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; - int i, ielen, n_chans; + int i, ielen; + u32 *n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, @@ -360,34 +369,34 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { + local->hw_scan_req->req.n_channels = req->n_channels; + for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } - - n_chans = req->n_channels; } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; - n_chans = 0; + n_chans = &local->hw_scan_req->req.n_channels; + *n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; - local->hw_scan_req->req.channels[n_chans] = + local->hw_scan_req->req.channels[(*n_chans)++] = req->channels[i]; - n_chans++; + bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; - } while (!n_chans); + } while (!*n_chans); } - local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) @@ -400,6 +409,8 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); + if (ielen < 0) + return false; local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); @@ -476,7 +487,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) } /* Set power back to normal operating levels. */ - ieee80211_hw_config(local, 0); + ieee80211_hw_conf_chan(local); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); @@ -523,7 +534,7 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ - if (local->use_chanctx) + if (!local->emulate_chanctx) return -EOPNOTSUPP; /* @@ -553,7 +564,7 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ - ieee80211_hw_config(local, 0); + ieee80211_hw_conf_chan(local); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); @@ -638,6 +649,7 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; + IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_SCAN_TX; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } @@ -677,7 +689,10 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, * After sending probe requests, wait for probe responses * on the channel. */ - *next_delay = IEEE80211_CHANNEL_TIME; + *next_delay = msecs_to_jiffies(scan_req->duration) > + IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME ? + msecs_to_jiffies(scan_req->duration) - IEEE80211_PROBE_DELAY : + IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } @@ -694,19 +709,11 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it - * is indeed active. If no link ID was specified, select one of the - * active links. + * is indeed active. */ - if (ieee80211_vif_is_mld(&sdata->vif)) { - if (req->tsf_report_link_id >= 0) { - if (!(sdata->vif.active_links & - BIT(req->tsf_report_link_id))) - return -EINVAL; - } else { - req->tsf_report_link_id = - __ffs(sdata->vif.active_links); - } - } + if (ieee80211_vif_is_mld(&sdata->vif) && req->tsf_report_link_id >= 0 && + !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) + return -EINVAL; if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; @@ -738,15 +745,21 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_ies_bufsize *= n_bands; } - local->hw_scan_req = kmalloc( - sizeof(*local->hw_scan_req) + - req->n_channels * sizeof(req->channels[0]) + - local->hw_scan_ies_bufsize, GFP_KERNEL); + local->hw_scan_req = kmalloc(struct_size(local->hw_scan_req, + req.channels, + req->n_channels) + + local->hw_scan_ies_bufsize, + GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; + /* None of the channels are actually set + * up but let UBSAN know the boundaries. + */ + local->hw_scan_req->req.n_channels = req->n_channels; + ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); @@ -787,7 +800,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && - (req->channels[0] == local->_oper_chandef.chan)) { + (req->channels[0] == local->hw.conf.chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities @@ -805,7 +818,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ - ieee80211_hw_config(local, 0); + ieee80211_hw_conf_chan(local); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || @@ -970,13 +983,13 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, /* If scanning on oper channel, use whatever channel-type * is currently in use. */ - if (chan == local->_oper_chandef.chan) - local->scan_chandef = local->_oper_chandef; + if (chan == local->hw.conf.chandef.chan) + local->scan_chandef = local->hw.conf.chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: - if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL)) + if (ieee80211_hw_conf_chan(local)) skip = 1; /* advance state machine to next channel/band */ @@ -1000,7 +1013,10 @@ set_channel: */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { - *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; + *next_delay = msecs_to_jiffies(scan_req->duration) > + IEEE80211_PASSIVE_CHANNEL_TIME ? + msecs_to_jiffies(scan_req->duration) : + IEEE80211_PASSIVE_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); @@ -1017,7 +1033,7 @@ static void ieee80211_scan_state_suspend(struct ieee80211_local *local, { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + ieee80211_hw_conf_chan(local); /* disable PS */ ieee80211_offchannel_return(local); @@ -1316,10 +1332,12 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef); - ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, - &sched_scan_ies, req->ie, - req->ie_len, bands_used, rate_masks, &chandef, - flags); + ret = ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, + &sched_scan_ies, req->ie, + req->ie_len, bands_used, rate_masks, + &chandef, flags); + if (ret < 0) + goto error; ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { @@ -1327,8 +1345,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, rcu_assign_pointer(local->sched_scan_req, req); } +error: kfree(ie); - out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 55959b0b24..b2de4c6fb8 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -19,21 +19,222 @@ #include "sta_info.h" #include "wme.h" +static bool +wbcs_elem_to_chandef(const struct ieee80211_wide_bw_chansw_ie *wbcs_elem, + struct cfg80211_chan_def *chandef) +{ + u8 ccfs0 = wbcs_elem->new_center_freq_seg0; + u8 ccfs1 = wbcs_elem->new_center_freq_seg1; + u32 cf0 = ieee80211_channel_to_frequency(ccfs0, chandef->chan->band); + u32 cf1 = ieee80211_channel_to_frequency(ccfs1, chandef->chan->band); + + switch (wbcs_elem->new_channel_width) { + case IEEE80211_VHT_CHANWIDTH_160MHZ: + /* deprecated encoding */ + chandef->width = NL80211_CHAN_WIDTH_160; + chandef->center_freq1 = cf0; + break; + case IEEE80211_VHT_CHANWIDTH_80P80MHZ: + /* deprecated encoding */ + chandef->width = NL80211_CHAN_WIDTH_80P80; + chandef->center_freq1 = cf0; + chandef->center_freq2 = cf1; + break; + case IEEE80211_VHT_CHANWIDTH_80MHZ: + chandef->width = NL80211_CHAN_WIDTH_80; + chandef->center_freq1 = cf0; + + if (ccfs1) { + u8 diff = abs(ccfs0 - ccfs1); + + if (diff == 8) { + chandef->width = NL80211_CHAN_WIDTH_160; + chandef->center_freq1 = cf1; + } else if (diff > 8) { + chandef->width = NL80211_CHAN_WIDTH_80P80; + chandef->center_freq2 = cf1; + } + } + break; + case IEEE80211_VHT_CHANWIDTH_USE_HT: + default: + /* If the WBCS Element is present, new channel bandwidth is + * at least 40 MHz. + */ + chandef->width = NL80211_CHAN_WIDTH_40; + chandef->center_freq1 = cf0; + break; + } + + return cfg80211_chandef_valid(chandef); +} + +static void +validate_chandef_by_ht_vht_oper(struct ieee80211_sub_if_data *sdata, + struct ieee80211_conn_settings *conn, + u32 vht_cap_info, + struct cfg80211_chan_def *chandef) +{ + u32 control_freq, center_freq1, center_freq2; + enum nl80211_chan_width chan_width; + struct ieee80211_ht_operation ht_oper; + struct ieee80211_vht_operation vht_oper; + + if (conn->mode < IEEE80211_CONN_MODE_HT || + conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { + chandef->chan = NULL; + return; + } + + control_freq = chandef->chan->center_freq; + center_freq1 = chandef->center_freq1; + center_freq2 = chandef->center_freq2; + chan_width = chandef->width; + + ht_oper.primary_chan = ieee80211_frequency_to_channel(control_freq); + if (control_freq != center_freq1) + ht_oper.ht_param = control_freq > center_freq1 ? + IEEE80211_HT_PARAM_CHA_SEC_BELOW : + IEEE80211_HT_PARAM_CHA_SEC_ABOVE; + else + ht_oper.ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; + + ieee80211_chandef_ht_oper(&ht_oper, chandef); + + if (conn->mode < IEEE80211_CONN_MODE_VHT) + return; + + vht_oper.center_freq_seg0_idx = + ieee80211_frequency_to_channel(center_freq1); + vht_oper.center_freq_seg1_idx = center_freq2 ? + ieee80211_frequency_to_channel(center_freq2) : 0; + + switch (chan_width) { + case NL80211_CHAN_WIDTH_320: + WARN_ON(1); + break; + case NL80211_CHAN_WIDTH_160: + vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; + vht_oper.center_freq_seg1_idx = vht_oper.center_freq_seg0_idx; + vht_oper.center_freq_seg0_idx += + control_freq < center_freq1 ? -8 : 8; + break; + case NL80211_CHAN_WIDTH_80P80: + vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; + break; + case NL80211_CHAN_WIDTH_80: + vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; + break; + default: + vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; + break; + } + + ht_oper.operation_mode = + le16_encode_bits(vht_oper.center_freq_seg1_idx, + IEEE80211_HT_OP_MODE_CCFS2_MASK); + + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, + &vht_oper, &ht_oper, chandef)) + chandef->chan = NULL; +} + +static void +validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, + struct ieee80211_conn_settings *conn, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = sdata->local; + u32 control_freq, center_freq1, center_freq2; + enum nl80211_chan_width chan_width; + struct { + struct ieee80211_he_operation _oper; + struct ieee80211_he_6ghz_oper _6ghz_oper; + } __packed he; + struct { + struct ieee80211_eht_operation _oper; + struct ieee80211_eht_operation_info _oper_info; + } __packed eht; + const struct ieee80211_eht_operation *eht_oper; + + if (conn->mode < IEEE80211_CONN_MODE_HE) { + chandef->chan = NULL; + return; + } + + control_freq = chandef->chan->center_freq; + center_freq1 = chandef->center_freq1; + center_freq2 = chandef->center_freq2; + chan_width = chandef->width; + + he._oper.he_oper_params = + le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); + he._6ghz_oper.primary = + ieee80211_frequency_to_channel(control_freq); + he._6ghz_oper.ccfs0 = ieee80211_frequency_to_channel(center_freq1); + he._6ghz_oper.ccfs1 = center_freq2 ? + ieee80211_frequency_to_channel(center_freq2) : 0; + + switch (chan_width) { + case NL80211_CHAN_WIDTH_320: + he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; + he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -16 : 16; + he._6ghz_oper.control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; + break; + case NL80211_CHAN_WIDTH_160: + he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; + he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -8 : 8; + fallthrough; + case NL80211_CHAN_WIDTH_80P80: + he._6ghz_oper.control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + break; + case NL80211_CHAN_WIDTH_80: + he._6ghz_oper.control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; + break; + case NL80211_CHAN_WIDTH_40: + he._6ghz_oper.control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; + break; + default: + he._6ghz_oper.control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; + break; + } + + if (conn->mode < IEEE80211_CONN_MODE_EHT) { + eht_oper = NULL; + } else { + eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT; + eht._oper_info.control = he._6ghz_oper.control; + eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0; + eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1; + eht_oper = &eht._oper; + } + + if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, + eht_oper, chandef)) + chandef->chan = NULL; +} + int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, - ieee80211_conn_flags_t conn_flags, u8 *bssid, + struct ieee80211_conn_settings *conn, + u8 *bssid, struct ieee80211_csa_ie *csa_ie) { enum nl80211_band new_band = current_band; int new_freq; - u8 new_chan_no; + u8 new_chan_no = 0, new_op_class = 0; struct ieee80211_channel *new_chan; - struct cfg80211_chan_def new_vht_chandef = {}; + struct cfg80211_chan_def new_chandef = {}; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const struct ieee80211_bandwidth_indication *bwi; + const struct ieee80211_ext_chansw_ie *ext_chansw_elem; int secondary_channel_offset = -1; memset(csa_ie, 0, sizeof(*csa_ie)); @@ -41,36 +242,41 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, sec_chan_offs = elems->sec_chan_offs; wide_bw_chansw_ie = elems->wide_bw_chansw_ie; bwi = elems->bandwidth_indication; + ext_chansw_elem = elems->ext_chansw_ie; - if (conn_flags & (IEEE80211_CONN_DISABLE_HT | - IEEE80211_CONN_DISABLE_40MHZ)) { + if (conn->mode < IEEE80211_CONN_MODE_HT || + conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { sec_chan_offs = NULL; wide_bw_chansw_ie = NULL; } - if (conn_flags & IEEE80211_CONN_DISABLE_VHT) + if (conn->mode < IEEE80211_CONN_MODE_VHT) wide_bw_chansw_ie = NULL; - if (elems->ext_chansw_ie) { - if (!ieee80211_operating_class_to_band( - elems->ext_chansw_ie->new_operating_class, - &new_band)) { - sdata_info(sdata, - "cannot understand ECSA IE operating class, %d, ignoring\n", - elems->ext_chansw_ie->new_operating_class); + if (ext_chansw_elem) { + new_op_class = ext_chansw_elem->new_operating_class; + + if (!ieee80211_operating_class_to_band(new_op_class, &new_band)) { + new_op_class = 0; + sdata_info(sdata, "cannot understand ECSA IE operating class, %d, ignoring\n", + ext_chansw_elem->new_operating_class); + } else { + new_chan_no = ext_chansw_elem->new_ch_num; + csa_ie->count = ext_chansw_elem->count; + csa_ie->mode = ext_chansw_elem->mode; } - new_chan_no = elems->ext_chansw_ie->new_ch_num; - csa_ie->count = elems->ext_chansw_ie->count; - csa_ie->mode = elems->ext_chansw_ie->mode; - } else if (elems->ch_switch_ie) { + } + + if (!new_op_class && elems->ch_switch_ie) { new_chan_no = elems->ch_switch_ie->new_ch_num; csa_ie->count = elems->ch_switch_ie->count; csa_ie->mode = elems->ch_switch_ie->mode; - } else { - /* nothing here we understand */ - return 1; } + /* nothing here we understand */ + if (!new_chan_no) + return 1; + /* Mesh Channel Switch Parameters Element */ if (elems->mesh_chansw_params_ie) { csa_ie->ttl = elems->mesh_chansw_params_ie->mesh_ttl; @@ -95,7 +301,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, if (sec_chan_offs) { secondary_channel_offset = sec_chan_offs->sec_chan_offs; - } else if (!(conn_flags & IEEE80211_CONN_DISABLE_HT)) { + } else if (conn->mode >= IEEE80211_CONN_MODE_HT) { /* If the secondary channel offset IE is not present, * we can't know what's the post-CSA offset, so the * best we can do is use 20MHz. @@ -107,26 +313,26 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, default: /* secondary_channel_offset was present but is invalid */ case IEEE80211_HT_PARAM_CHA_SEC_NONE: - cfg80211_chandef_create(&csa_ie->chandef, new_chan, + cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT20); break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: - cfg80211_chandef_create(&csa_ie->chandef, new_chan, + cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT40PLUS); break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: - cfg80211_chandef_create(&csa_ie->chandef, new_chan, + cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT40MINUS); break; case -1: - cfg80211_chandef_create(&csa_ie->chandef, new_chan, + cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_NO_HT); /* keep width for 5/10 MHz channels */ - switch (sdata->vif.bss_conf.chandef.width) { + switch (sdata->vif.bss_conf.chanreq.oper.width) { case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: - csa_ie->chandef.width = - sdata->vif.bss_conf.chandef.width; + csa_ie->chanreq.oper.width = + sdata->vif.bss_conf.chanreq.oper.width; break; default: break; @@ -134,59 +340,52 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, break; } + /* parse one of the Elements to build a new chandef */ + memset(&new_chandef, 0, sizeof(new_chandef)); + new_chandef.chan = new_chan; if (bwi) { /* start with the CSA one */ - new_vht_chandef = csa_ie->chandef; + new_chandef = csa_ie->chanreq.oper; /* and update the width accordingly */ - /* FIXME: support 160/320 */ - ieee80211_chandef_eht_oper(&bwi->info, true, true, - &new_vht_chandef); - } else if (wide_bw_chansw_ie) { - u8 new_seg1 = wide_bw_chansw_ie->new_center_freq_seg1; - struct ieee80211_vht_operation vht_oper = { - .chan_width = - wide_bw_chansw_ie->new_channel_width, - .center_freq_seg0_idx = - wide_bw_chansw_ie->new_center_freq_seg0, - .center_freq_seg1_idx = new_seg1, - /* .basic_mcs_set doesn't matter */ - }; - struct ieee80211_ht_operation ht_oper = { - .operation_mode = - cpu_to_le16(new_seg1 << - IEEE80211_HT_OP_MODE_CCFS2_SHIFT), - }; - - /* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT, - * to the previously parsed chandef - */ - new_vht_chandef = csa_ie->chandef; - - /* ignore if parsing fails */ - if (!ieee80211_chandef_vht_oper(&sdata->local->hw, - vht_cap_info, - &vht_oper, &ht_oper, - &new_vht_chandef)) - new_vht_chandef.chan = NULL; - - if (conn_flags & IEEE80211_CONN_DISABLE_80P80MHZ && - new_vht_chandef.width == NL80211_CHAN_WIDTH_80P80) - ieee80211_chandef_downgrade(&new_vht_chandef); - if (conn_flags & IEEE80211_CONN_DISABLE_160MHZ && - new_vht_chandef.width == NL80211_CHAN_WIDTH_160) - ieee80211_chandef_downgrade(&new_vht_chandef); + ieee80211_chandef_eht_oper(&bwi->info, &new_chandef); + + if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT) + new_chandef.punctured = + get_unaligned_le16(bwi->info.optional); + } else if (!wide_bw_chansw_ie || !wbcs_elem_to_chandef(wide_bw_chansw_ie, + &new_chandef)) { + if (!ieee80211_operating_class_to_chandef(new_op_class, new_chan, + &new_chandef)) + new_chandef = csa_ie->chanreq.oper; } - /* if VHT data is there validate & use it */ - if (new_vht_chandef.chan) { - if (!cfg80211_chandef_compatible(&new_vht_chandef, - &csa_ie->chandef)) { + /* check if the new chandef fits the capabilities */ + if (new_band == NL80211_BAND_6GHZ) + validate_chandef_by_6ghz_he_eht_oper(sdata, conn, &new_chandef); + else + validate_chandef_by_ht_vht_oper(sdata, conn, vht_cap_info, + &new_chandef); + + /* if data is there validate the bandwidth & use it */ + if (new_chandef.chan) { + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320 && + new_chandef.width == NL80211_CHAN_WIDTH_320) + ieee80211_chandef_downgrade(&new_chandef, NULL); + + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160 && + (new_chandef.width == NL80211_CHAN_WIDTH_80P80 || + new_chandef.width == NL80211_CHAN_WIDTH_160)) + ieee80211_chandef_downgrade(&new_chandef, NULL); + + if (!cfg80211_chandef_compatible(&new_chandef, + &csa_ie->chanreq.oper)) { sdata_info(sdata, "BSS %pM: CSA has inconsistent channel data, disconnecting\n", bssid); return -EINVAL; } - csa_ie->chandef = new_vht_chandef; + + csa_ie->chanreq.oper = new_chandef; } if (elems->max_channel_switch_time) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 4391d8dd63..aa22f09e6d 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1566,7 +1566,8 @@ void sta_info_stop(struct ieee80211_local *local) } -int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans) +int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, + int link_id) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; @@ -1580,12 +1581,18 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans) WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - if (sdata == sta->sdata || - (vlans && sdata->bss == sta->sdata->bss)) { - if (!WARN_ON(__sta_info_destroy_part1(sta))) - list_add(&sta->free_list, &free_list); - ret++; - } + if (sdata != sta->sdata && + (!vlans || sdata->bss != sta->sdata->bss)) + continue; + + if (link_id >= 0 && sta->sta.valid_links && + !(sta->sta.valid_links & BIT(link_id))) + continue; + + if (!WARN_ON(__sta_info_destroy_part1(sta))) + list_add(&sta->free_list, &free_list); + + ret++; } if (!list_empty(&free_list)) { @@ -1717,7 +1724,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ - spin_lock(&sta->ps_lock); + spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; @@ -1746,7 +1753,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); - spin_unlock(&sta->ps_lock); + spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index ac4c7a6f96..9195d5a2de 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -727,6 +727,12 @@ struct sta_info { struct ieee80211_sta sta; }; +static inline int ieee80211_tdls_sta_link_id(struct sta_info *sta) +{ + /* TDLS STA can only have a single link */ + return sta->sta.valid_links ? __ffs(sta->sta.valid_links) : 0; +} + static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta) { #ifdef CONFIG_MAC80211_MESH @@ -886,23 +892,31 @@ void sta_info_stop(struct ieee80211_local *local); /** * __sta_info_flush - flush matching STA entries from the STA table * - * Returns the number of removed STA entries. + * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @vlans: if the given interface is an AP interface, also flush VLANs + * @link_id: if given (>=0), all those STA entries using @link_id only + * will be removed. If -1 is passed, all STA entries will be + * removed. */ -int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans); +int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, + int link_id); /** * sta_info_flush - flush matching STA entries from the STA table * - * Returns the number of removed STA entries. + * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from + * @link_id: if given (>=0), all those STA entries using @link_id only + * will be removed. If -1 is passed, all STA entries will be + * removed. */ -static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata) +static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata, + int link_id) { - return __sta_info_flush(sdata, false); + return __sta_info_flush(sdata, false, link_id); } void sta_set_rate_info_tx(struct sta_info *sta, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 1708b33cdc..dd8f857a1f 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2021-2023 Intel Corporation + * Copyright 2021-2024 Intel Corporation */ #include <linux/export.h> @@ -696,6 +696,23 @@ static void ieee80211_handle_smps_status(struct ieee80211_sub_if_data *sdata, wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); } +static void +ieee80211_handle_teardown_ttlm_status(struct ieee80211_sub_if_data *sdata, + bool acked) +{ + if (!sdata || !ieee80211_sdata_running(sdata)) + return; + + if (!acked) + return; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return; + + wiphy_work_queue(sdata->local->hw.wiphy, + &sdata->u.mgd.teardown_ttlm_work); +} + static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped, ktime_t ack_hwtstamp) @@ -773,6 +790,9 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, ieee80211_handle_smps_status(sdata, acked, info->status_data); break; + case IEEE80211_STATUS_TYPE_NEG_TTLM: + ieee80211_handle_teardown_ttlm_status(sdata, acked); + break; } rcu_read_unlock(); } diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 49730b4241..f07b409164 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019, 2021-2023 Intel Corporation + * Copyright (C) 2019, 2021-2024 Intel Corporation */ #include <linux/ieee80211.h> @@ -159,7 +159,7 @@ static void ieee80211_tdls_add_oper_classes(struct ieee80211_link_data *link, u8 *pos; u8 op_class; - if (!ieee80211_chandef_to_operating_class(&link->conf->chandef, + if (!ieee80211_chandef_to_operating_class(&link->conf->chanreq.oper, &op_class)) return; @@ -347,7 +347,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, (uc.width > sta->tdls_chandef.width && !cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc, sdata->wdev.iftype))) - ieee80211_chandef_downgrade(&uc); + ieee80211_chandef_downgrade(&uc, NULL); if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { tdls_dbg(sdata, "TDLS ch width upgraded %d -> %d\n", @@ -382,8 +382,8 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, if (WARN_ON_ONCE(!sband)) return; - ieee80211_add_srates_ie(sdata, skb, false, sband->band); - ieee80211_add_ext_srates_ie(sdata, skb, false, sband->band); + ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_EXT_SUPP_RATES); ieee80211_tdls_add_supp_channels(sdata, skb); /* add any custom IEs that go before Extended Capabilities */ @@ -438,7 +438,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, if (WARN_ON_ONCE(!sta)) return; - sta->tdls_chandef = link->conf->chandef; + sta->tdls_chandef = link->conf->chanreq.oper; } ieee80211_tdls_add_oper_classes(link, skb); @@ -548,30 +548,14 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, } /* build the HE-cap from sband */ - if (he_cap && - (action_code == WLAN_TDLS_SETUP_REQUEST || - action_code == WLAN_TDLS_SETUP_RESPONSE || - action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES)) { - __le16 he_6ghz_capa; - u8 cap_size; - - cap_size = - 2 + 1 + sizeof(he_cap->he_cap_elem) + - ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) + - ieee80211_he_ppe_size(he_cap->ppe_thres[0], - he_cap->he_cap_elem.phy_cap_info); - pos = skb_put(skb, cap_size); - pos = ieee80211_ie_build_he_cap(0, pos, he_cap, pos + cap_size); + if (action_code == WLAN_TDLS_SETUP_REQUEST || + action_code == WLAN_TDLS_SETUP_RESPONSE || + action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) { + ieee80211_put_he_cap(skb, sdata, sband, NULL); /* Build HE 6Ghz capa IE from sband */ - if (sband->band == NL80211_BAND_6GHZ) { - cap_size = 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); - pos = skb_put(skb, cap_size); - he_6ghz_capa = - ieee80211_get_he_6ghz_capa_vif(sband, &sdata->vif); - pos = ieee80211_write_he_6ghz_cap(pos, he_6ghz_capa, - pos + cap_size); - } + if (sband->band == NL80211_BAND_6GHZ) + ieee80211_put_he_6ghz_cap(skb, sdata, link->smps_mode); } /* add any custom IEs that go before EHT capabilities */ @@ -591,21 +575,10 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, } /* build the EHT-cap from sband */ - if (he_cap && eht_cap && - (action_code == WLAN_TDLS_SETUP_REQUEST || - action_code == WLAN_TDLS_SETUP_RESPONSE || - action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES)) { - u8 cap_size; - - cap_size = - 2 + 1 + sizeof(eht_cap->eht_cap_elem) + - ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, - &eht_cap->eht_cap_elem, false) + - ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], - eht_cap->eht_cap_elem.phy_cap_info); - pos = skb_put(skb, cap_size); - ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + cap_size, false); - } + if (action_code == WLAN_TDLS_SETUP_REQUEST || + action_code == WLAN_TDLS_SETUP_RESPONSE || + action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) + ieee80211_put_eht_cap(skb, sdata, sband, NULL); /* add any remaining IEs */ if (extra_ies_len) { @@ -638,7 +611,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_link_data *link, if (WARN_ON_ONCE(!sta || !ap_sta)) return; - sta->tdls_chandef = link->conf->chandef; + sta->tdls_chandef = link->conf->chanreq.oper; /* add any custom IEs that go before the QoS IE */ if (extra_ies_len) { @@ -684,7 +657,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_link_data *link, pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation)); ieee80211_ie_build_ht_oper(pos, &sta->sta.deflink.ht_cap, - &link->conf->chandef, prot, + &link->conf->chanreq.oper, prot, true); } @@ -1413,8 +1386,8 @@ iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; u16 opmode; - /* Nothing to do if the BSS connection uses HT */ - if (!(sdata->deflink.u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HT)) + /* Nothing to do if the BSS connection uses (at least) HT */ + if (sdata->deflink.u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) return; tdls_ht = (sta && sta->sta.deflink.ht_cap.ht_supported) || @@ -2055,8 +2028,9 @@ ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, } } -void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) +void ieee80211_teardown_tdls_peers(struct ieee80211_link_data *link) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct sta_info *sta; u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED; @@ -2066,6 +2040,9 @@ void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) continue; + if (sta->deflink.link_id != link->link_id) + continue; + ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr, NL80211_TDLS_TEARDOWN, reason, GFP_ATOMIC); diff --git a/net/mac80211/tests/elems.c b/net/mac80211/tests/elems.c index 997d0cd27b..a413ba29f7 100644 --- a/net/mac80211/tests/elems.c +++ b/net/mac80211/tests/elems.c @@ -2,7 +2,7 @@ /* * KUnit tests for element parsing * - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation */ #include <kunit/test.h> #include "../ieee80211_i.h" @@ -14,6 +14,7 @@ static void mle_defrag(struct kunit *test) struct ieee80211_elems_parse_params parse_params = { .link_id = 12, .from_ap = true, + .mode = IEEE80211_CONN_MODE_EHT, }; struct ieee802_11_elems *parsed; struct sk_buff *skb; @@ -68,7 +69,7 @@ static void mle_defrag(struct kunit *test) if (IS_ERR_OR_NULL(parsed)) goto free_skb; - KUNIT_EXPECT_NOT_NULL(test, parsed->ml_basic_elem); + KUNIT_EXPECT_NOT_NULL(test, parsed->ml_basic); KUNIT_EXPECT_EQ(test, parsed->ml_basic_len, 2 /* control */ + diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 06835ed4c4..b26aacfbc6 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2024 Intel Corporation */ #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -33,7 +33,7 @@ __string(vif_name, sdata->name) #define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ __entry->p2p = sdata->vif.p2p; \ - __assign_str(vif_name, sdata->name) + __assign_str(vif_name) #define VIF_PR_FMT " vif:%s(%d%s)" #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" @@ -50,7 +50,7 @@ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? (c)->center_freq2 : 0; -#define CHANDEF_PR_FMT " control:%d.%03d MHz width:%d center: %d.%03d/%d MHz" +#define CHANDEF_PR_FMT " chandef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 @@ -69,22 +69,45 @@ __entry->min_center_freq1 = (c)->center_freq1; \ __entry->min_freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; -#define MIN_CHANDEF_PR_FMT " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz" +#define MIN_CHANDEF_PR_FMT " mindef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" #define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ __entry->min_chan_width, \ __entry->min_center_freq1, __entry->min_freq1_offset, \ __entry->min_center_freq2 +#define AP_CHANDEF_ENTRY \ + __field(u32, ap_control_freq) \ + __field(u32, ap_freq_offset) \ + __field(u32, ap_chan_width) \ + __field(u32, ap_center_freq1) \ + __field(u32, ap_freq1_offset) \ + __field(u32, ap_center_freq2) + +#define AP_CHANDEF_ASSIGN(c) \ + __entry->ap_control_freq = (c)->chan ? (c)->chan->center_freq : 0;\ + __entry->ap_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0;\ + __entry->ap_chan_width = (c)->chan ? (c)->width : 0; \ + __entry->ap_center_freq1 = (c)->chan ? (c)->center_freq1 : 0; \ + __entry->ap_freq1_offset = (c)->chan ? (c)->freq1_offset : 0; \ + __entry->ap_center_freq2 = (c)->chan ? (c)->center_freq2 : 0; +#define AP_CHANDEF_PR_FMT " ap(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)" +#define AP_CHANDEF_PR_ARG __entry->ap_control_freq, __entry->ap_freq_offset, \ + __entry->ap_chan_width, \ + __entry->ap_center_freq1, __entry->ap_freq1_offset, \ + __entry->ap_center_freq2 + #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ + AP_CHANDEF_ENTRY \ __field(u8, rx_chains_static) \ __field(u8, rx_chains_dynamic) #define CHANCTX_ASSIGN CHANDEF_ASSIGN(&ctx->conf.def) \ MIN_CHANDEF_ASSIGN(&ctx->conf.min_def) \ + AP_CHANDEF_ASSIGN(&ctx->conf.ap) \ __entry->rx_chains_static = ctx->conf.rx_chains_static; \ __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic -#define CHANCTX_PR_FMT CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT " chains:%d/%d" -#define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, \ +#define CHANCTX_PR_FMT CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT AP_CHANDEF_PR_FMT " chains:%d/%d" +#define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, AP_CHANDEF_PR_ARG, \ __entry->rx_chains_static, __entry->rx_chains_dynamic #define KEY_ENTRY __field(u32, cipher) \ @@ -503,9 +526,9 @@ TRACE_EVENT(drv_link_info_changed, __entry->ht_operation_mode = link_conf->ht_operation_mode; __entry->cqm_rssi_thold = link_conf->cqm_rssi_thold; __entry->cqm_rssi_hyst = link_conf->cqm_rssi_hyst; - __entry->channel_width = link_conf->chandef.width; - __entry->channel_cfreq1 = link_conf->chandef.center_freq1; - __entry->channel_cfreq1_offset = link_conf->chandef.freq1_offset; + __entry->channel_width = link_conf->chanreq.oper.width; + __entry->channel_cfreq1 = link_conf->chanreq.oper.center_freq1; + __entry->channel_cfreq1_offset = link_conf->chanreq.oper.freq1_offset; __entry->qos = link_conf->qos; __entry->hidden_ssid = link_conf->hidden_ssid; __entry->txpower = link_conf->txpower; @@ -1186,7 +1209,7 @@ DEFINE_EVENT(sta_event, drv_flush_sta, TP_ARGS(local, sdata, sta) ); -TRACE_EVENT(drv_channel_switch, +DECLARE_EVENT_CLASS(chanswitch_evt, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), @@ -1201,6 +1224,7 @@ TRACE_EVENT(drv_channel_switch, __field(u32, device_timestamp) __field(bool, block_tx) __field(u8, count) + __field(u8, link_id) ), TP_fast_assign( @@ -1211,14 +1235,24 @@ TRACE_EVENT(drv_channel_switch, __entry->device_timestamp = ch_switch->device_timestamp; __entry->block_tx = ch_switch->block_tx; __entry->count = ch_switch->count; + __entry->link_id = ch_switch->link_id; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " new " CHANDEF_PR_FMT " count:%d", - LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count + LOCAL_PR_FMT VIF_PR_FMT CHANDEF_PR_FMT " count:%d block_tx:%d timestamp:%llu device_ts:%u link_id:%d", + LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count, + __entry->block_tx, __entry->timestamp, + __entry->device_timestamp, __entry->link_id ) ); +DEFINE_EVENT(chanswitch_evt, drv_channel_switch, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel_switch *ch_switch), + TP_ARGS(local, sdata, ch_switch) +); + TRACE_EVENT(drv_set_antenna, TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), @@ -2098,39 +2132,11 @@ TRACE_EVENT(drv_channel_switch_beacon, ) ); -TRACE_EVENT(drv_pre_channel_switch, +DEFINE_EVENT(chanswitch_evt, drv_pre_channel_switch, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), - - TP_ARGS(local, sdata, ch_switch), - - TP_STRUCT__entry( - LOCAL_ENTRY - VIF_ENTRY - CHANDEF_ENTRY - __field(u64, timestamp) - __field(u32, device_timestamp) - __field(bool, block_tx) - __field(u8, count) - ), - - TP_fast_assign( - LOCAL_ASSIGN; - VIF_ASSIGN; - CHANDEF_ASSIGN(&ch_switch->chandef) - __entry->timestamp = ch_switch->timestamp; - __entry->device_timestamp = ch_switch->device_timestamp; - __entry->block_tx = ch_switch->block_tx; - __entry->count = ch_switch->count; - ), - - TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " prepare channel switch to " - CHANDEF_PR_FMT " count:%d block_tx:%d timestamp:%llu", - LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count, - __entry->block_tx, __entry->timestamp - ) + TP_ARGS(local, sdata, ch_switch) ); DEFINE_EVENT(local_sdata_evt, drv_post_channel_switch, @@ -2145,40 +2151,11 @@ DEFINE_EVENT(local_sdata_evt, drv_abort_channel_switch, TP_ARGS(local, sdata) ); -TRACE_EVENT(drv_channel_switch_rx_beacon, +DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel_switch *ch_switch), - - TP_ARGS(local, sdata, ch_switch), - - TP_STRUCT__entry( - LOCAL_ENTRY - VIF_ENTRY - CHANDEF_ENTRY - __field(u64, timestamp) - __field(u32, device_timestamp) - __field(bool, block_tx) - __field(u8, count) - ), - - TP_fast_assign( - LOCAL_ASSIGN; - VIF_ASSIGN; - CHANDEF_ASSIGN(&ch_switch->chandef) - __entry->timestamp = ch_switch->timestamp; - __entry->device_timestamp = ch_switch->device_timestamp; - __entry->block_tx = ch_switch->block_tx; - __entry->count = ch_switch->count; - ), - - TP_printk( - LOCAL_PR_FMT VIF_PR_FMT - " received a channel switch beacon to " - CHANDEF_PR_FMT " count:%d block_tx:%d timestamp:%llu", - LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count, - __entry->block_tx, __entry->timestamp - ) + TP_ARGS(local, sdata, ch_switch) ); TRACE_EVENT(drv_get_txpower, @@ -3035,6 +3012,34 @@ TRACE_EVENT(api_radar_detected, ) ); +TRACE_EVENT(api_request_smps, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_data *link, + enum ieee80211_smps_mode smps_mode), + + TP_ARGS(local, sdata, link, smps_mode), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(int, link_id) + __field(u32, smps_mode) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->link_id = link->link_id, + __entry->smps_mode = smps_mode; + ), + + TP_printk( + LOCAL_PR_FMT " " VIF_PR_FMT " link:%d, smps_mode:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->smps_mode + ) +); + /* * Tracing for internal functions * (which may also be called in response to driver calls) @@ -3088,6 +3093,58 @@ TRACE_EVENT(stop_queue, ) ); +TRACE_EVENT(drv_can_neg_ttlm, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_neg_ttlm *neg_ttlm), + + TP_ARGS(local, sdata, neg_ttlm), + + TP_STRUCT__entry(LOCAL_ENTRY + VIF_ENTRY + __array(u16, downlink, sizeof(u16) * 8) + __array(u16, uplink, sizeof(u16) * 8) + ), + + TP_fast_assign(LOCAL_ASSIGN; + VIF_ASSIGN; + memcpy(__entry->downlink, neg_ttlm->downlink, + sizeof(neg_ttlm->downlink)); + memcpy(__entry->uplink, neg_ttlm->uplink, + sizeof(neg_ttlm->uplink)); + ), + + TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG) +); + +TRACE_EVENT(drv_neg_ttlm_res, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_neg_ttlm_res res, + struct ieee80211_neg_ttlm *neg_ttlm), + + TP_ARGS(local, sdata, res, neg_ttlm), + + TP_STRUCT__entry(LOCAL_ENTRY + VIF_ENTRY + __field(u32, res) + __array(u16, downlink, sizeof(u16) * 8) + __array(u16, uplink, sizeof(u16) * 8) + ), + + TP_fast_assign(LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->res = res; + memcpy(__entry->downlink, neg_ttlm->downlink, + sizeof(neg_ttlm->downlink)); + memcpy(__entry->uplink, neg_ttlm->uplink, + sizeof(neg_ttlm->uplink)); + ), + + TP_printk(LOCAL_PR_FMT VIF_PR_FMT " response: %d\n ", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->res + ) +); #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h index c9dbe9aab7..aea4ce55c5 100644 --- a/net/mac80211/trace_msg.h +++ b/net/mac80211/trace_msg.h @@ -16,8 +16,6 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211_msg -#define MAX_MSG_LEN 120 - DECLARE_EVENT_CLASS(mac80211_msg_event, TP_PROTO(struct va_format *vaf), diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6fbb15b659..edba4a3184 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -133,6 +133,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, mrate = sband->bitrates[0].bitrate; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *r = &sband->bitrates[i]; + u32 flag; if (r->bitrate > txrate->bitrate) break; @@ -145,28 +146,24 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, switch (sband->band) { case NL80211_BAND_2GHZ: - case NL80211_BAND_LC: { - u32 flag; + case NL80211_BAND_LC: if (tx->sdata->deflink.operating_11g_mode) flag = IEEE80211_RATE_MANDATORY_G; else flag = IEEE80211_RATE_MANDATORY_B; - if (r->flags & flag) - mrate = r->bitrate; break; - } case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: - if (r->flags & IEEE80211_RATE_MANDATORY_A) - mrate = r->bitrate; + flag = IEEE80211_RATE_MANDATORY_A; break; - case NL80211_BAND_S1GHZ: - case NL80211_BAND_60GHZ: - /* TODO, for now fall through */ - case NUM_NL80211_BANDS: + default: + flag = 0; WARN_ON(1); break; } + + if (r->flags & flag) + mrate = r->bitrate; } if (rate == -1) { /* No matching basic rate found; use highest suitable mandatory @@ -701,11 +698,16 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.bss_conf = &tx->sdata->vif.bss_conf; txrc.skb = tx->skb; txrc.reported_rate.idx = -1; - txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band]; - if (tx->sdata->rc_has_mcs_mask[info->band]) - txrc.rate_idx_mcs_mask = - tx->sdata->rc_rateidx_mcs_mask[info->band]; + if (unlikely(info->control.flags & IEEE80211_TX_CTRL_SCAN_TX)) { + txrc.rate_idx_mask = ~0; + } else { + txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band]; + + if (tx->sdata->rc_has_mcs_mask[info->band]) + txrc.rate_idx_mcs_mask = + tx->sdata->rc_rateidx_mcs_mask[info->band]; + } txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT || @@ -1607,8 +1609,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) local->cparams.target = MS2TIME(20); local->cparams.ecn = true; - local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), - GFP_KERNEL); + local->cvars = kvcalloc(fq->flows_cnt, sizeof(local->cvars[0]), + GFP_KERNEL); if (!local->cvars) { spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); @@ -1628,7 +1630,7 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; - kfree(local->cvars); + kvfree(local->cvars); local->cvars = NULL; spin_lock_bh(&fq->lock); @@ -1766,7 +1768,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, break; } sdata = rcu_dereference(local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; @@ -2393,12 +2395,18 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (chanctx_conf) chandef = &chanctx_conf->def; - else if (!local->use_chanctx) - chandef = &local->_oper_chandef; else goto fail_rcu; /* + * If driver/HW supports IEEE80211_CHAN_CAN_MONITOR we still + * shouldn't transmit on disabled channels. + */ + if (!cfg80211_chandef_usable(local->hw.wiphy, chandef, + IEEE80211_CHAN_DISABLED)) + goto fail_rcu; + + /* * Frame injection is not allowed if beaconing is not allowed * or if we need radar detection. Beaconing is usually not allowed when * the mode or operation (Adhoc, AP, Mesh) does not support DFS. @@ -2766,8 +2774,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, if (tdls_peer) { /* For TDLS only one link can be valid with peer STA */ - int tdls_link_id = sta->sta.valid_links ? - __ffs(sta->sta.valid_links) : 0; + int tdls_link_id = ieee80211_tdls_sta_link_id(sta); struct ieee80211_link_data *link; /* DA SA BSSID */ @@ -3093,8 +3100,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) case NL80211_IFTYPE_STATION: if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { /* For TDLS only one link can be valid with peer STA */ - int tdls_link_id = sta->sta.valid_links ? - __ffs(sta->sta.valid_links) : 0; + int tdls_link_id = ieee80211_tdls_sta_link_id(sta); struct ieee80211_link_data *link; /* DA SA BSSID */ @@ -3951,7 +3957,8 @@ begin: break; } tx.sdata = rcu_dereference(local->monitor_sdata); - if (tx.sdata) { + if (tx.sdata && + ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { vif = &tx.sdata->vif; info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; @@ -3959,7 +3966,8 @@ begin: ieee80211_free_txskb(&local->hw, skb); goto begin; } else { - vif = NULL; + info->control.vif = NULL; + return skb; } break; case NL80211_IFTYPE_AP_VLAN: @@ -5032,16 +5040,24 @@ static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon) return beacon->cntdwn_current_counter; } -u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif) +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link; struct beacon_data *beacon = NULL; u8 count = 0; + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return 0; + rcu_read_lock(); + link = rcu_dereference(sdata->link[link_id]); + if (!link) + goto unlock; + if (sdata->vif.type == NL80211_IFTYPE_AP) - beacon = rcu_dereference(sdata->deflink.u.ap.beacon); + beacon = rcu_dereference(link->u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) @@ -5083,9 +5099,11 @@ unlock: } EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn); -bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif) +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, + unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link; struct beacon_data *beacon = NULL; u8 *beacon_data; size_t beacon_data_len; @@ -5094,9 +5112,17 @@ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif) if (!ieee80211_sdata_running(sdata)) return false; + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return 0; + rcu_read_lock(); + + link = rcu_dereference(sdata->link[link_id]); + if (!link) + goto out; + if (vif->type == NL80211_IFTYPE_AP) { - beacon = rcu_dereference(sdata->deflink.u.ap.beacon); + beacon = rcu_dereference(link->u.ap.beacon); if (WARN_ON(!beacon || !beacon->tail)) goto out; beacon_data = beacon->tail; @@ -5282,7 +5308,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw, if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) - ieee80211_beacon_update_cntdwn(vif); + ieee80211_beacon_update_cntdwn(vif, link->link_id); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 643c54855b..c11dbe82ae 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * utilities for mac80211 */ @@ -46,6 +46,11 @@ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); +const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited = { + .mode = IEEE80211_CONN_MODE_EHT, + .bw_limit = IEEE80211_CONN_BW_LIMIT_320, +}; + u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, enum nl80211_iftype type) { @@ -771,7 +776,7 @@ static void __iterate_interfaces(struct ieee80211_local *local, sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || lockdep_is_held(&local->hw.wiphy->mtx)); - if (sdata && + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) iterator(data, sdata->vif.addr, &sdata->vif); @@ -912,776 +917,6 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_queue_delayed_work); -static void -ieee80211_parse_extension_element(u32 *crc, - const struct element *elem, - struct ieee802_11_elems *elems, - struct ieee80211_elems_parse_params *params) -{ - const void *data = elem->data + 1; - bool calc_crc = false; - u8 len; - - if (!elem->datalen) - return; - - len = elem->datalen - 1; - - switch (elem->data[0]) { - case WLAN_EID_EXT_HE_MU_EDCA: - calc_crc = true; - if (len >= sizeof(*elems->mu_edca_param_set)) - elems->mu_edca_param_set = data; - break; - case WLAN_EID_EXT_HE_CAPABILITY: - if (ieee80211_he_capa_size_ok(data, len)) { - elems->he_cap = data; - elems->he_cap_len = len; - } - break; - case WLAN_EID_EXT_HE_OPERATION: - calc_crc = true; - if (len >= sizeof(*elems->he_operation) && - len >= ieee80211_he_oper_size(data) - 1) - elems->he_operation = data; - break; - case WLAN_EID_EXT_UORA: - if (len >= 1) - elems->uora_element = data; - break; - case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME: - if (len == 3) - elems->max_channel_switch_time = data; - break; - case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION: - if (len >= sizeof(*elems->mbssid_config_ie)) - elems->mbssid_config_ie = data; - break; - case WLAN_EID_EXT_HE_SPR: - if (len >= sizeof(*elems->he_spr) && - len >= ieee80211_he_spr_size(data)) - elems->he_spr = data; - break; - case WLAN_EID_EXT_HE_6GHZ_CAPA: - if (len >= sizeof(*elems->he_6ghz_capa)) - elems->he_6ghz_capa = data; - break; - case WLAN_EID_EXT_EHT_CAPABILITY: - if (ieee80211_eht_capa_size_ok(elems->he_cap, - data, len, - params->from_ap)) { - elems->eht_cap = data; - elems->eht_cap_len = len; - } - break; - case WLAN_EID_EXT_EHT_OPERATION: - if (ieee80211_eht_oper_size_ok(data, len)) - elems->eht_operation = data; - calc_crc = true; - break; - case WLAN_EID_EXT_EHT_MULTI_LINK: - calc_crc = true; - - if (ieee80211_mle_size_ok(data, len)) { - const struct ieee80211_multi_link_elem *mle = - (void *)data; - - switch (le16_get_bits(mle->control, - IEEE80211_ML_CONTROL_TYPE)) { - case IEEE80211_ML_CONTROL_TYPE_BASIC: - elems->ml_basic_elem = (void *)elem; - elems->ml_basic = data; - elems->ml_basic_len = len; - break; - case IEEE80211_ML_CONTROL_TYPE_RECONF: - elems->ml_reconf_elem = (void *)elem; - elems->ml_reconf = data; - elems->ml_reconf_len = len; - break; - default: - break; - } - } - break; - case WLAN_EID_EXT_BANDWIDTH_INDICATION: - if (ieee80211_bandwidth_indication_size_ok(data, len)) - elems->bandwidth_indication = data; - calc_crc = true; - break; - case WLAN_EID_EXT_TID_TO_LINK_MAPPING: - calc_crc = true; - if (ieee80211_tid_to_link_map_size_ok(data, len) && - elems->ttlm_num < ARRAY_SIZE(elems->ttlm)) { - elems->ttlm[elems->ttlm_num] = (void *)data; - elems->ttlm_num++; - } - break; - } - - if (crc && calc_crc) - *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); -} - -static u32 -_ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, - struct ieee802_11_elems *elems, - const struct element *check_inherit) -{ - const struct element *elem; - bool calc_crc = params->filter != 0; - DECLARE_BITMAP(seen_elems, 256); - u32 crc = params->crc; - - bitmap_zero(seen_elems, 256); - - for_each_element(elem, params->start, params->len) { - const struct element *subelem; - bool elem_parse_failed; - u8 id = elem->id; - u8 elen = elem->datalen; - const u8 *pos = elem->data; - - if (check_inherit && - !cfg80211_is_element_inherited(elem, - check_inherit)) - continue; - - switch (id) { - case WLAN_EID_SSID: - case WLAN_EID_SUPP_RATES: - case WLAN_EID_FH_PARAMS: - case WLAN_EID_DS_PARAMS: - case WLAN_EID_CF_PARAMS: - case WLAN_EID_TIM: - case WLAN_EID_IBSS_PARAMS: - case WLAN_EID_CHALLENGE: - case WLAN_EID_RSN: - case WLAN_EID_ERP_INFO: - case WLAN_EID_EXT_SUPP_RATES: - case WLAN_EID_HT_CAPABILITY: - case WLAN_EID_HT_OPERATION: - case WLAN_EID_VHT_CAPABILITY: - case WLAN_EID_VHT_OPERATION: - case WLAN_EID_MESH_ID: - case WLAN_EID_MESH_CONFIG: - case WLAN_EID_PEER_MGMT: - case WLAN_EID_PREQ: - case WLAN_EID_PREP: - case WLAN_EID_PERR: - case WLAN_EID_RANN: - case WLAN_EID_CHANNEL_SWITCH: - case WLAN_EID_EXT_CHANSWITCH_ANN: - case WLAN_EID_COUNTRY: - case WLAN_EID_PWR_CONSTRAINT: - case WLAN_EID_TIMEOUT_INTERVAL: - case WLAN_EID_SECONDARY_CHANNEL_OFFSET: - case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: - case WLAN_EID_CHAN_SWITCH_PARAM: - case WLAN_EID_EXT_CAPABILITY: - case WLAN_EID_CHAN_SWITCH_TIMING: - case WLAN_EID_LINK_ID: - case WLAN_EID_BSS_MAX_IDLE_PERIOD: - case WLAN_EID_RSNX: - case WLAN_EID_S1G_BCN_COMPAT: - case WLAN_EID_S1G_CAPABILITIES: - case WLAN_EID_S1G_OPERATION: - case WLAN_EID_AID_RESPONSE: - case WLAN_EID_S1G_SHORT_BCN_INTERVAL: - /* - * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible - * that if the content gets bigger it might be needed more than once - */ - if (test_bit(id, seen_elems)) { - elems->parse_error = true; - continue; - } - break; - } - - if (calc_crc && id < 64 && (params->filter & (1ULL << id))) - crc = crc32_be(crc, pos - 2, elen + 2); - - elem_parse_failed = false; - - switch (id) { - case WLAN_EID_LINK_ID: - if (elen + 2 < sizeof(struct ieee80211_tdls_lnkie)) { - elem_parse_failed = true; - break; - } - elems->lnk_id = (void *)(pos - 2); - break; - case WLAN_EID_CHAN_SWITCH_TIMING: - if (elen < sizeof(struct ieee80211_ch_switch_timing)) { - elem_parse_failed = true; - break; - } - elems->ch_sw_timing = (void *)pos; - break; - case WLAN_EID_EXT_CAPABILITY: - elems->ext_capab = pos; - elems->ext_capab_len = elen; - break; - case WLAN_EID_SSID: - elems->ssid = pos; - elems->ssid_len = elen; - break; - case WLAN_EID_SUPP_RATES: - elems->supp_rates = pos; - elems->supp_rates_len = elen; - break; - case WLAN_EID_DS_PARAMS: - if (elen >= 1) - elems->ds_params = pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_TIM: - if (elen >= sizeof(struct ieee80211_tim_ie)) { - elems->tim = (void *)pos; - elems->tim_len = elen; - } else - elem_parse_failed = true; - break; - case WLAN_EID_VENDOR_SPECIFIC: - if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && - pos[2] == 0xf2) { - /* Microsoft OUI (00:50:F2) */ - - if (calc_crc) - crc = crc32_be(crc, pos - 2, elen + 2); - - if (elen >= 5 && pos[3] == 2) { - /* OUI Type 2 - WMM IE */ - if (pos[4] == 0) { - elems->wmm_info = pos; - elems->wmm_info_len = elen; - } else if (pos[4] == 1) { - elems->wmm_param = pos; - elems->wmm_param_len = elen; - } - } - } - break; - case WLAN_EID_RSN: - elems->rsn = pos; - elems->rsn_len = elen; - break; - case WLAN_EID_ERP_INFO: - if (elen >= 1) - elems->erp_info = pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_EXT_SUPP_RATES: - elems->ext_supp_rates = pos; - elems->ext_supp_rates_len = elen; - break; - case WLAN_EID_HT_CAPABILITY: - if (elen >= sizeof(struct ieee80211_ht_cap)) - elems->ht_cap_elem = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_HT_OPERATION: - if (elen >= sizeof(struct ieee80211_ht_operation)) - elems->ht_operation = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_VHT_CAPABILITY: - if (elen >= sizeof(struct ieee80211_vht_cap)) - elems->vht_cap_elem = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_VHT_OPERATION: - if (elen >= sizeof(struct ieee80211_vht_operation)) { - elems->vht_operation = (void *)pos; - if (calc_crc) - crc = crc32_be(crc, pos - 2, elen + 2); - break; - } - elem_parse_failed = true; - break; - case WLAN_EID_OPMODE_NOTIF: - if (elen > 0) { - elems->opmode_notif = pos; - if (calc_crc) - crc = crc32_be(crc, pos - 2, elen + 2); - break; - } - elem_parse_failed = true; - break; - case WLAN_EID_MESH_ID: - elems->mesh_id = pos; - elems->mesh_id_len = elen; - break; - case WLAN_EID_MESH_CONFIG: - if (elen >= sizeof(struct ieee80211_meshconf_ie)) - elems->mesh_config = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_PEER_MGMT: - elems->peering = pos; - elems->peering_len = elen; - break; - case WLAN_EID_MESH_AWAKE_WINDOW: - if (elen >= 2) - elems->awake_window = (void *)pos; - break; - case WLAN_EID_PREQ: - elems->preq = pos; - elems->preq_len = elen; - break; - case WLAN_EID_PREP: - elems->prep = pos; - elems->prep_len = elen; - break; - case WLAN_EID_PERR: - elems->perr = pos; - elems->perr_len = elen; - break; - case WLAN_EID_RANN: - if (elen >= sizeof(struct ieee80211_rann_ie)) - elems->rann = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_CHANNEL_SWITCH: - if (elen != sizeof(struct ieee80211_channel_sw_ie)) { - elem_parse_failed = true; - break; - } - elems->ch_switch_ie = (void *)pos; - break; - case WLAN_EID_EXT_CHANSWITCH_ANN: - if (elen != sizeof(struct ieee80211_ext_chansw_ie)) { - elem_parse_failed = true; - break; - } - elems->ext_chansw_ie = (void *)pos; - break; - case WLAN_EID_SECONDARY_CHANNEL_OFFSET: - if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) { - elem_parse_failed = true; - break; - } - elems->sec_chan_offs = (void *)pos; - break; - case WLAN_EID_CHAN_SWITCH_PARAM: - if (elen < - sizeof(*elems->mesh_chansw_params_ie)) { - elem_parse_failed = true; - break; - } - elems->mesh_chansw_params_ie = (void *)pos; - break; - case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: - if (!params->action || - elen < sizeof(*elems->wide_bw_chansw_ie)) { - elem_parse_failed = true; - break; - } - elems->wide_bw_chansw_ie = (void *)pos; - break; - case WLAN_EID_CHANNEL_SWITCH_WRAPPER: - if (params->action) { - elem_parse_failed = true; - break; - } - /* - * This is a bit tricky, but as we only care about - * a few elements, parse them out manually. - */ - subelem = cfg80211_find_elem(WLAN_EID_WIDE_BW_CHANNEL_SWITCH, - pos, elen); - if (subelem) { - if (subelem->datalen >= sizeof(*elems->wide_bw_chansw_ie)) - elems->wide_bw_chansw_ie = - (void *)subelem->data; - else - elem_parse_failed = true; - } - - subelem = cfg80211_find_ext_elem(WLAN_EID_EXT_BANDWIDTH_INDICATION, - pos, elen); - if (subelem) { - const void *edata = subelem->data + 1; - u8 edatalen = subelem->datalen - 1; - - if (ieee80211_bandwidth_indication_size_ok(edata, - edatalen)) - elems->bandwidth_indication = edata; - else - elem_parse_failed = true; - } - break; - case WLAN_EID_COUNTRY: - elems->country_elem = pos; - elems->country_elem_len = elen; - break; - case WLAN_EID_PWR_CONSTRAINT: - if (elen != 1) { - elem_parse_failed = true; - break; - } - elems->pwr_constr_elem = pos; - break; - case WLAN_EID_CISCO_VENDOR_SPECIFIC: - /* Lots of different options exist, but we only care - * about the Dynamic Transmit Power Control element. - * First check for the Cisco OUI, then for the DTPC - * tag (0x00). - */ - if (elen < 4) { - elem_parse_failed = true; - break; - } - - if (pos[0] != 0x00 || pos[1] != 0x40 || - pos[2] != 0x96 || pos[3] != 0x00) - break; - - if (elen != 6) { - elem_parse_failed = true; - break; - } - - if (calc_crc) - crc = crc32_be(crc, pos - 2, elen + 2); - - elems->cisco_dtpc_elem = pos; - break; - case WLAN_EID_ADDBA_EXT: - if (elen < sizeof(struct ieee80211_addba_ext_ie)) { - elem_parse_failed = true; - break; - } - elems->addba_ext_ie = (void *)pos; - break; - case WLAN_EID_TIMEOUT_INTERVAL: - if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) - elems->timeout_int = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_BSS_MAX_IDLE_PERIOD: - if (elen >= sizeof(*elems->max_idle_period_ie)) - elems->max_idle_period_ie = (void *)pos; - break; - case WLAN_EID_RSNX: - elems->rsnx = pos; - elems->rsnx_len = elen; - break; - case WLAN_EID_TX_POWER_ENVELOPE: - if (elen < 1 || - elen > sizeof(struct ieee80211_tx_pwr_env)) - break; - - if (elems->tx_pwr_env_num >= ARRAY_SIZE(elems->tx_pwr_env)) - break; - - elems->tx_pwr_env[elems->tx_pwr_env_num] = (void *)pos; - elems->tx_pwr_env_len[elems->tx_pwr_env_num] = elen; - elems->tx_pwr_env_num++; - break; - case WLAN_EID_EXTENSION: - ieee80211_parse_extension_element(calc_crc ? - &crc : NULL, - elem, elems, params); - break; - case WLAN_EID_S1G_CAPABILITIES: - if (elen >= sizeof(*elems->s1g_capab)) - elems->s1g_capab = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_S1G_OPERATION: - if (elen == sizeof(*elems->s1g_oper)) - elems->s1g_oper = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_S1G_BCN_COMPAT: - if (elen == sizeof(*elems->s1g_bcn_compat)) - elems->s1g_bcn_compat = (void *)pos; - else - elem_parse_failed = true; - break; - case WLAN_EID_AID_RESPONSE: - if (elen == sizeof(struct ieee80211_aid_response_ie)) - elems->aid_resp = (void *)pos; - else - elem_parse_failed = true; - break; - default: - break; - } - - if (elem_parse_failed) - elems->parse_error = true; - else - __set_bit(id, seen_elems); - } - - if (!for_each_element_completed(elem, params->start, params->len)) - elems->parse_error = true; - - return crc; -} - -static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, - struct ieee802_11_elems *elems, - struct cfg80211_bss *bss, - u8 *nontransmitted_profile) -{ - const struct element *elem, *sub; - size_t profile_len = 0; - bool found = false; - - if (!bss || !bss->transmitted_bss) - return profile_len; - - for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { - if (elem->datalen < 2) - continue; - if (elem->data[0] < 1 || elem->data[0] > 8) - continue; - - for_each_element(sub, elem->data + 1, elem->datalen - 1) { - u8 new_bssid[ETH_ALEN]; - const u8 *index; - - if (sub->id != 0 || sub->datalen < 4) { - /* not a valid BSS profile */ - continue; - } - - if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || - sub->data[1] != 2) { - /* The first element of the - * Nontransmitted BSSID Profile is not - * the Nontransmitted BSSID Capability - * element. - */ - continue; - } - - memset(nontransmitted_profile, 0, len); - profile_len = cfg80211_merge_profile(start, len, - elem, - sub, - nontransmitted_profile, - len); - - /* found a Nontransmitted BSSID Profile */ - index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, - nontransmitted_profile, - profile_len); - if (!index || index[1] < 1 || index[2] == 0) { - /* Invalid MBSSID Index element */ - continue; - } - - cfg80211_gen_new_bssid(bss->transmitted_bss->bssid, - elem->data[0], - index[2], - new_bssid); - if (ether_addr_equal(new_bssid, bss->bssid)) { - found = true; - elems->bssid_index_len = index[1]; - elems->bssid_index = (void *)&index[2]; - break; - } - } - } - - return found ? profile_len : 0; -} - -static void ieee80211_mle_get_sta_prof(struct ieee802_11_elems *elems, - u8 link_id) -{ - const struct ieee80211_multi_link_elem *ml = elems->ml_basic; - ssize_t ml_len = elems->ml_basic_len; - const struct element *sub; - - if (!ml || !ml_len) - return; - - if (le16_get_bits(ml->control, IEEE80211_ML_CONTROL_TYPE) != - IEEE80211_ML_CONTROL_TYPE_BASIC) - return; - - for_each_mle_subelement(sub, (u8 *)ml, ml_len) { - struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; - ssize_t sta_prof_len; - u16 control; - - if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) - continue; - - if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, - sub->datalen)) - return; - - control = le16_to_cpu(prof->control); - - if (link_id != u16_get_bits(control, - IEEE80211_MLE_STA_CONTROL_LINK_ID)) - continue; - - if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) - return; - - /* the sub element can be fragmented */ - sta_prof_len = - cfg80211_defragment_element(sub, - (u8 *)ml, ml_len, - elems->scratch_pos, - elems->scratch + - elems->scratch_len - - elems->scratch_pos, - IEEE80211_MLE_SUBELEM_FRAGMENT); - - if (sta_prof_len < 0) - return; - - elems->prof = (void *)elems->scratch_pos; - elems->sta_prof_len = sta_prof_len; - elems->scratch_pos += sta_prof_len; - - return; - } -} - -static void ieee80211_mle_parse_link(struct ieee802_11_elems *elems, - struct ieee80211_elems_parse_params *params) -{ - struct ieee80211_mle_per_sta_profile *prof; - struct ieee80211_elems_parse_params sub = { - .action = params->action, - .from_ap = params->from_ap, - .link_id = -1, - }; - ssize_t ml_len = elems->ml_basic_len; - const struct element *non_inherit = NULL; - const u8 *end; - - if (params->link_id == -1) - return; - - ml_len = cfg80211_defragment_element(elems->ml_basic_elem, - elems->ie_start, - elems->total_len, - elems->scratch_pos, - elems->scratch + - elems->scratch_len - - elems->scratch_pos, - WLAN_EID_FRAGMENT); - - if (ml_len < 0) - return; - - elems->ml_basic = (const void *)elems->scratch_pos; - elems->ml_basic_len = ml_len; - - ieee80211_mle_get_sta_prof(elems, params->link_id); - prof = elems->prof; - - if (!prof) - return; - - /* check if we have the 4 bytes for the fixed part in assoc response */ - if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { - elems->prof = NULL; - elems->sta_prof_len = 0; - return; - } - - /* - * Skip the capability information and the status code that are expected - * as part of the station profile in association response frames. Note - * the -1 is because the 'sta_info_len' is accounted to as part of the - * per-STA profile, but not part of the 'u8 variable[]' portion. - */ - sub.start = prof->variable + prof->sta_info_len - 1 + 4; - end = (const u8 *)prof + elems->sta_prof_len; - sub.len = end - sub.start; - - non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - sub.start, sub.len); - _ieee802_11_parse_elems_full(&sub, elems, non_inherit); -} - -struct ieee802_11_elems * -ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) -{ - struct ieee802_11_elems *elems; - const struct element *non_inherit = NULL; - u8 *nontransmitted_profile; - int nontransmitted_profile_len = 0; - size_t scratch_len = 3 * params->len; - - elems = kzalloc(struct_size(elems, scratch, scratch_len), GFP_ATOMIC); - if (!elems) - return NULL; - elems->ie_start = params->start; - elems->total_len = params->len; - elems->scratch_len = scratch_len; - elems->scratch_pos = elems->scratch; - - nontransmitted_profile = elems->scratch_pos; - nontransmitted_profile_len = - ieee802_11_find_bssid_profile(params->start, params->len, - elems, params->bss, - nontransmitted_profile); - elems->scratch_pos += nontransmitted_profile_len; - elems->scratch_len -= nontransmitted_profile_len; - non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - nontransmitted_profile, - nontransmitted_profile_len); - - elems->crc = _ieee802_11_parse_elems_full(params, elems, non_inherit); - - /* Override with nontransmitted profile, if found */ - if (nontransmitted_profile_len) { - struct ieee80211_elems_parse_params sub = { - .start = nontransmitted_profile, - .len = nontransmitted_profile_len, - .action = params->action, - .link_id = params->link_id, - }; - - _ieee802_11_parse_elems_full(&sub, elems, NULL); - } - - ieee80211_mle_parse_link(elems, params); - - if (elems->tim && !elems->parse_error) { - const struct ieee80211_tim_ie *tim_ie = elems->tim; - - elems->dtim_period = tim_ie->dtim_period; - elems->dtim_count = tim_ie->dtim_count; - } - - /* Override DTIM period and count if needed */ - if (elems->bssid_index && - elems->bssid_index_len >= - offsetofend(struct ieee80211_bssid_index, dtim_period)) - elems->dtim_period = elems->bssid_index->dtim_period; - - if (elems->bssid_index && - elems->bssid_index_len >= - offsetofend(struct ieee80211_bssid_index, dtim_count)) - elems->dtim_count = elems->bssid_index->dtim_count; - - return elems; -} -EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full); - void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_queue_params *qparam, int ac) @@ -1938,37 +1173,34 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, } } -u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end) +static int ieee80211_put_s1g_cap(struct sk_buff *skb, + struct ieee80211_sta_s1g_cap *s1g_cap) { - if ((end - pos) < 5) - return pos; + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_s1g_cap)) + return -ENOBUFS; - *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(cap); - *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; - memcpy(pos, &cap, sizeof(cap)); + skb_put_u8(skb, WLAN_EID_S1G_CAPABILITIES); + skb_put_u8(skb, sizeof(struct ieee80211_s1g_cap)); - return pos + 2; + skb_put_data(skb, &s1g_cap->cap, sizeof(s1g_cap->cap)); + skb_put_data(skb, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs)); + + return 0; } -static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, - u8 *buffer, size_t buffer_len, - const u8 *ie, size_t ie_len, - enum nl80211_band band, - u32 rate_mask, - struct cfg80211_chan_def *chandef, - size_t *offset, u32 flags) +static int ieee80211_put_preq_ies_band(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const u8 *ie, size_t ie_len, + size_t *offset, + enum nl80211_band band, + u32 rate_mask, + struct cfg80211_chan_def *chandef, + u32 flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; - const struct ieee80211_sta_he_cap *he_cap; - const struct ieee80211_sta_eht_cap *eht_cap; - u8 *pos = buffer, *end = buffer + buffer_len; + int i, err; size_t noffset; - int supp_rates_len, i; - u8 rates[32]; - int num_rates; - int ext_rates_len; u32 rate_flags; bool have_80mhz = false; @@ -1981,32 +1213,13 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, rate_flags = ieee80211_chandef_rate_flags(chandef); /* For direct scan add S1G IE and consider its override bits */ - if (band == NL80211_BAND_S1GHZ) { - if (end - pos < 2 + sizeof(struct ieee80211_s1g_cap)) - goto out_err; - pos = ieee80211_ie_build_s1g_cap(pos, &sband->s1g_cap); - goto done; - } - - num_rates = 0; - for (i = 0; i < sband->n_bitrates; i++) { - if ((BIT(i) & rate_mask) == 0) - continue; /* skip rate */ - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - - rates[num_rates++] = - (u8) DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); - } - - supp_rates_len = min_t(int, num_rates, 8); + if (band == NL80211_BAND_S1GHZ) + return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); - if (end - pos < 2 + supp_rates_len) - goto out_err; - *pos++ = WLAN_EID_SUPP_RATES; - *pos++ = supp_rates_len; - memcpy(pos, rates, supp_rates_len); - pos += supp_rates_len; + err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + ~rate_mask, WLAN_EID_SUPP_RATES); + if (err) + return err; /* insert "request information" if in custom IEs */ if (ie && ie_len) { @@ -2019,34 +1232,28 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, before_extrates, ARRAY_SIZE(before_extrates), *offset); - if (end - pos < noffset - *offset) - goto out_err; - memcpy(pos, ie + *offset, noffset - *offset); - pos += noffset - *offset; + if (skb_tailroom(skb) < noffset - *offset) + return -ENOBUFS; + skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } - ext_rates_len = num_rates - supp_rates_len; - if (ext_rates_len > 0) { - if (end - pos < 2 + ext_rates_len) - goto out_err; - *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = ext_rates_len; - memcpy(pos, rates + supp_rates_len, ext_rates_len); - pos += ext_rates_len; - } + err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + ~rate_mask, WLAN_EID_EXT_SUPP_RATES); + if (err) + return err; if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { - if (end - pos < 3) - goto out_err; - *pos++ = WLAN_EID_DS_PARAMS; - *pos++ = 1; - *pos++ = ieee80211_frequency_to_channel( - chandef->chan->center_freq); + if (skb_tailroom(skb) < 3) + return -ENOBUFS; + skb_put_u8(skb, WLAN_EID_DS_PARAMS); + skb_put_u8(skb, 1); + skb_put_u8(skb, + ieee80211_frequency_to_channel(chandef->chan->center_freq)); } if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) - goto done; + return 0; /* insert custom IEs that go before HT */ if (ie && ie_len) { @@ -2061,18 +1268,21 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, noffset = ieee80211_ie_split(ie, ie_len, before_ht, ARRAY_SIZE(before_ht), *offset); - if (end - pos < noffset - *offset) - goto out_err; - memcpy(pos, ie + *offset, noffset - *offset); - pos += noffset - *offset; + if (skb_tailroom(skb) < noffset - *offset) + return -ENOBUFS; + skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } if (sband->ht_cap.ht_supported) { - if (end - pos < 2 + sizeof(struct ieee80211_ht_cap)) - goto out_err; - pos = ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, - sband->ht_cap.cap); + u8 *pos; + + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) + return -ENOBUFS; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap)); + ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, + sband->ht_cap.cap); } /* insert custom IEs that go before VHT */ @@ -2093,10 +1303,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), *offset); - if (end - pos < noffset - *offset) - goto out_err; - memcpy(pos, ie + *offset, noffset - *offset); - pos += noffset - *offset; + if (skb_tailroom(skb) < noffset - *offset) + return -ENOBUFS; + skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } @@ -2111,10 +1320,14 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, } if (sband->vht_cap.vht_supported && have_80mhz) { - if (end - pos < 2 + sizeof(struct ieee80211_vht_cap)) - goto out_err; - pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, - sband->vht_cap.cap); + u8 *pos; + + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap)) + return -ENOBUFS; + + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap)); + ieee80211_ie_build_vht_cap(pos, &sband->vht_cap, + sband->vht_cap.cap); } /* insert custom IEs that go before HE */ @@ -2131,107 +1344,128 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, noffset = ieee80211_ie_split(ie, ie_len, before_he, ARRAY_SIZE(before_he), *offset); - if (end - pos < noffset - *offset) - goto out_err; - memcpy(pos, ie + *offset, noffset - *offset); - pos += noffset - *offset; + if (skb_tailroom(skb) < noffset - *offset) + return -ENOBUFS; + skb_put_data(skb, ie + *offset, noffset - *offset); *offset = noffset; } - he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); - if (he_cap && - cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), + if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE)) { - pos = ieee80211_ie_build_he_cap(0, pos, he_cap, end); - if (!pos) - goto out_err; + err = ieee80211_put_he_cap(skb, sdata, sband, NULL); + if (err) + return err; } - eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); - - if (eht_cap && - cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), + if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), IEEE80211_CHAN_NO_HE | IEEE80211_CHAN_NO_EHT)) { - pos = ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, end, - sdata->vif.type == NL80211_IFTYPE_AP); - if (!pos) - goto out_err; + err = ieee80211_put_eht_cap(skb, sdata, sband, NULL); + if (err) + return err; } - if (cfg80211_any_usable_channels(local->hw.wiphy, - BIT(NL80211_BAND_6GHZ), - IEEE80211_CHAN_NO_HE)) { - struct ieee80211_supported_band *sband6; - - sband6 = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; - he_cap = ieee80211_get_he_iftype_cap_vif(sband6, &sdata->vif); - - if (he_cap) { - enum nl80211_iftype iftype = - ieee80211_vif_type_p2p(&sdata->vif); - __le16 cap = ieee80211_get_he_6ghz_capa(sband6, iftype); - - pos = ieee80211_write_he_6ghz_cap(pos, cap, end); - } - } + err = ieee80211_put_he_6ghz_cap(skb, sdata, IEEE80211_SMPS_OFF); + if (err) + return err; /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. */ - return pos - buffer; - out_err: - WARN_ONCE(1, "not enough space for preq IEs\n"); - done: - return pos - buffer; + return 0; } -int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, - size_t buffer_len, - struct ieee80211_scan_ies *ie_desc, - const u8 *ie, size_t ie_len, - u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef, - u32 flags) +static int ieee80211_put_preq_ies(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_scan_ies *ie_desc, + const u8 *ie, size_t ie_len, + u8 bands_used, u32 *rate_masks, + struct cfg80211_chan_def *chandef, + u32 flags) { - size_t pos = 0, old_pos = 0, custom_ie_offset = 0; - int i; + size_t custom_ie_offset = 0; + int i, err; memset(ie_desc, 0, sizeof(*ie_desc)); for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { - pos += ieee80211_build_preq_ies_band(sdata, - buffer + pos, - buffer_len - pos, - ie, ie_len, i, - rate_masks[i], - chandef, - &custom_ie_offset, - flags); - ie_desc->ies[i] = buffer + old_pos; - ie_desc->len[i] = pos - old_pos; - old_pos = pos; + ie_desc->ies[i] = skb_tail_pointer(skb); + err = ieee80211_put_preq_ies_band(skb, sdata, + ie, ie_len, + &custom_ie_offset, + i, rate_masks[i], + chandef, flags); + if (err) + return err; + ie_desc->len[i] = skb_tail_pointer(skb) - + ie_desc->ies[i]; } } /* add any remaining custom IEs */ if (ie && ie_len) { - if (WARN_ONCE(buffer_len - pos < ie_len - custom_ie_offset, + if (WARN_ONCE(skb_tailroom(skb) < ie_len - custom_ie_offset, "not enough space for preq custom IEs\n")) - return pos; - memcpy(buffer + pos, ie + custom_ie_offset, - ie_len - custom_ie_offset); - ie_desc->common_ies = buffer + pos; - ie_desc->common_ie_len = ie_len - custom_ie_offset; - pos += ie_len - custom_ie_offset; + return -ENOBUFS; + ie_desc->common_ies = skb_tail_pointer(skb); + skb_put_data(skb, ie + custom_ie_offset, + ie_len - custom_ie_offset); + ie_desc->common_ie_len = skb_tail_pointer(skb) - + ie_desc->common_ies; } - return pos; + return 0; }; +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, + size_t buffer_len, + struct ieee80211_scan_ies *ie_desc, + const u8 *ie, size_t ie_len, + u8 bands_used, u32 *rate_masks, + struct cfg80211_chan_def *chandef, + u32 flags) +{ + struct sk_buff *skb = alloc_skb(buffer_len, GFP_KERNEL); + uintptr_t offs; + int ret, i; + u8 *start; + + if (!skb) + return -ENOMEM; + + start = skb_tail_pointer(skb); + memset(start, 0, skb_tailroom(skb)); + ret = ieee80211_put_preq_ies(skb, sdata, ie_desc, ie, ie_len, + bands_used, rate_masks, chandef, + flags); + if (ret < 0) { + goto out; + } + + if (skb->len > buffer_len) { + ret = -ENOBUFS; + goto out; + } + + memcpy(buffer, start, skb->len); + + /* adjust ie_desc for copy */ + for (i = 0; i < NUM_NL80211_BANDS; i++) { + offs = ie_desc->ies[i] - start; + ie_desc->ies[i] = buffer + offs; + } + offs = ie_desc->common_ies - start; + ie_desc->common_ies = buffer + offs; + + ret = skb->len; +out: + consume_skb(skb); + return ret; +} + struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, @@ -2244,7 +1478,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def chandef; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; - int ies_len; u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; @@ -2253,7 +1486,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, * in order to maximize the chance that we get a response. Some * badly-behaved APs don't respond when this parameter is included. */ - chandef.width = sdata->vif.bss_conf.chandef.width; + chandef.width = sdata->vif.bss_conf.chanreq.oper.width; if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else @@ -2265,11 +1498,9 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return NULL; rate_masks[chan->band] = ratemask; - ies_len = ieee80211_build_preq_ies(sdata, skb_tail_pointer(skb), - skb_tailroom(skb), &dummy_ie_desc, - ie, ie_len, BIT(chan->band), - rate_masks, &chandef, flags); - skb_put(skb, ies_len); + ieee80211_put_preq_ies(skb, sdata, &dummy_ie_desc, + ie, ie_len, BIT(chan->band), + rate_masks, &chandef, flags); if (dst) { mgmt = (struct ieee80211_mgmt *) skb->data; @@ -2295,7 +1526,8 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, if (WARN_ON(!sband)) return 1; - rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); + rate_flags = + ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); num_rates = sband->n_bitrates; supp_rates = 0; @@ -2335,6 +1567,10 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, void ieee80211_stop_device(struct ieee80211_local *local) { + local_bh_disable(); + ieee80211_handle_queued_frames(local); + local_bh_enable(); + ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); @@ -2416,9 +1652,6 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local, lockdep_assert_wiphy(local->hw.wiphy); - if (!local->use_chanctx) - return; - conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (conf) { @@ -2612,7 +1845,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* add interfaces */ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); - if (sdata) { + if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) { /* in HW restart it exists already */ WARN_ON(local->resuming); res = drv_add_interface(local, sdata); @@ -2648,20 +1881,20 @@ int ieee80211_reconfig(struct ieee80211_local *local) } /* add channel contexts */ - if (local->use_chanctx) { - list_for_each_entry(ctx, &local->chanctx_list, list) - if (ctx->replace_state != - IEEE80211_CHANCTX_REPLACES_OTHER) - WARN_ON(drv_add_chanctx(local, ctx)); - - sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); - if (sdata && ieee80211_sdata_running(sdata)) - ieee80211_assign_chanctx(local, sdata, &sdata->deflink); - } + list_for_each_entry(ctx, &local->chanctx_list, list) + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) + WARN_ON(drv_add_chanctx(local, ctx)); + + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); + if (sdata && ieee80211_sdata_running(sdata)) + ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ - ieee80211_hw_config(local, ~0); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | + IEEE80211_CONF_CHANGE_MONITOR | + IEEE80211_CONF_CHANGE_PS | + IEEE80211_CONF_CHANGE_RETRY_LIMITS | + IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); @@ -2703,11 +1936,12 @@ int ieee80211_reconfig(struct ieee80211_local *local) old); } + sdata->restart_active_links = active_links; + for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { - if (ieee80211_vif_is_mld(&sdata->vif) && - !(sdata->vif.active_links & BIT(link_id))) + if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -2756,9 +1990,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) sdata->vif.bss_conf.protected_keep_alive) changed |= BSS_CHANGED_KEEP_ALIVE; - if (sdata->vif.bss_conf.eht_puncturing) - changed |= BSS_CHANGED_EHT_PUNCTURING; - ieee80211_bss_info_change_notify(sdata, changed); } else if (!WARN_ON(!link)) { @@ -2834,9 +2065,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(1); break; } - - if (active_links) - ieee80211_set_active_links(&sdata->vif, active_links); } ieee80211_recalc_ps(local); @@ -2877,6 +2105,20 @@ int ieee80211_reconfig(struct ieee80211_local *local) list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); + /* re-enable multi-link for client interfaces */ + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->restart_active_links) + ieee80211_set_active_links(&sdata->vif, + sdata->restart_active_links); + /* + * If a link switch was scheduled before the restart, and ran + * before reconfig, it will do nothing, so re-schedule. + */ + if (sdata->desired_active_links) + wiphy_work_queue(sdata->local->hw.wiphy, + &sdata->activate_links_work); + } + /* Reconfigure sched scan if it was interrupted by FW restart */ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); @@ -3109,21 +2351,6 @@ size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset) return pos; } -u8 *ieee80211_ie_build_s1g_cap(u8 *pos, struct ieee80211_sta_s1g_cap *s1g_cap) -{ - *pos++ = WLAN_EID_S1G_CAPABILITIES; - *pos++ = sizeof(struct ieee80211_s1g_cap); - memset(pos, 0, sizeof(struct ieee80211_s1g_cap)); - - memcpy(pos, &s1g_cap->cap, sizeof(s1g_cap->cap)); - pos += sizeof(s1g_cap->cap); - - memcpy(pos, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs)); - pos += sizeof(s1g_cap->nss_mcs); - - return pos; -} - u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, u16 cap) { @@ -3180,7 +2407,8 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos; } -u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) +/* this may return more than ieee80211_put_he_6ghz_cap() will need */ +u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; @@ -3190,7 +2418,7 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) if (!sband) return 0; - he_cap = ieee80211_get_he_iftype_cap(sband, iftype); + he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) return 0; @@ -3201,38 +2429,75 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) he_cap->he_cap_elem.phy_cap_info); } -u8 *ieee80211_ie_build_he_cap(ieee80211_conn_flags_t disable_flags, u8 *pos, +static void +ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn, const struct ieee80211_sta_he_cap *he_cap, - u8 *end) + struct ieee80211_he_cap_elem *elem) { - struct ieee80211_he_cap_elem elem; - u8 n; - u8 ie_len; - u8 *orig_pos = pos; + u8 ru_limit, max_ru; - /* Make sure we have place for the IE */ - /* - * TODO: the 1 added is because this temporarily is under the EXTENSION - * IE. Get rid of it when it moves. - */ - if (!he_cap) - return orig_pos; + *elem = he_cap->he_cap_elem; - /* modify on stack first to calculate 'n' and 'ie_len' correctly */ - elem = he_cap->he_cap_elem; + switch (conn->bw_limit) { + case IEEE80211_CONN_BW_LIMIT_20: + ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242; + break; + case IEEE80211_CONN_BW_LIMIT_40: + ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484; + break; + case IEEE80211_CONN_BW_LIMIT_80: + ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996; + break; + default: + ru_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996; + break; + } - if (disable_flags & IEEE80211_CONN_DISABLE_40MHZ) - elem.phy_cap_info[0] &= + max_ru = elem->phy_cap_info[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; + max_ru = min(max_ru, ru_limit); + elem->phy_cap_info[8] &= ~IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK; + elem->phy_cap_info[8] |= max_ru; + + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { + elem->phy_cap_info[0] &= ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G); + elem->phy_cap_info[9] &= + ~IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM; + } - if (disable_flags & IEEE80211_CONN_DISABLE_160MHZ) - elem.phy_cap_info[0] &= - ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { + elem->phy_cap_info[0] &= + ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G); + elem->phy_cap_info[5] &= + ~IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK; + elem->phy_cap_info[7] &= + ~(IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ | + IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ); + } +} - if (disable_flags & IEEE80211_CONN_DISABLE_80P80MHZ) - elem.phy_cap_info[0] &= - ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; +int ieee80211_put_he_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband, + const struct ieee80211_conn_settings *conn) +{ + const struct ieee80211_sta_he_cap *he_cap; + struct ieee80211_he_cap_elem elem; + u8 *len; + u8 n; + u8 ie_len; + + if (!conn) + conn = &ieee80211_conn_settings_unlimited; + + he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); + if (!he_cap) + return 0; + + /* modify on stack first to calculate 'n' and 'ie_len' correctly */ + ieee80211_get_adjusted_he_cap(conn, he_cap, &elem); n = ieee80211_he_mcs_nss_size(&elem); ie_len = 2 + 1 + @@ -3240,19 +2505,17 @@ u8 *ieee80211_ie_build_he_cap(ieee80211_conn_flags_t disable_flags, u8 *pos, ieee80211_he_ppe_size(he_cap->ppe_thres[0], he_cap->he_cap_elem.phy_cap_info); - if ((end - pos) < ie_len) - return orig_pos; + if (skb_tailroom(skb) < ie_len) + return -ENOBUFS; - *pos++ = WLAN_EID_EXTENSION; - pos++; /* We'll set the size later below */ - *pos++ = WLAN_EID_EXT_HE_CAPABILITY; + skb_put_u8(skb, WLAN_EID_EXTENSION); + len = skb_put(skb, 1); /* We'll set the size later below */ + skb_put_u8(skb, WLAN_EID_EXT_HE_CAPABILITY); /* Fixed data */ - memcpy(pos, &elem, sizeof(elem)); - pos += sizeof(elem); + skb_put_data(skb, &elem, sizeof(elem)); - memcpy(pos, &he_cap->he_mcs_nss_supp, n); - pos += n; + skb_put_data(skb, &he_cap->he_mcs_nss_supp, n); /* Check if PPE Threshold should be present */ if ((he_cap->he_cap_elem.phy_cap_info[6] & @@ -3276,41 +2539,39 @@ u8 *ieee80211_ie_build_he_cap(ieee80211_conn_flags_t disable_flags, u8 *pos, n = DIV_ROUND_UP(n, 8); /* Copy PPE Thresholds */ - memcpy(pos, &he_cap->ppe_thres, n); - pos += n; + skb_put_data(skb, &he_cap->ppe_thres, n); end: - orig_pos[1] = (pos - orig_pos) - 2; - return pos; + *len = skb_tail_pointer(skb) - len - 1; + return 0; } -void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, - enum ieee80211_smps_mode smps_mode, - struct sk_buff *skb) +int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + enum ieee80211_smps_mode smps_mode) { struct ieee80211_supported_band *sband; const struct ieee80211_sband_iftype_data *iftd; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); - u8 *pos; - u16 cap; + __le16 cap; if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy, BIT(NL80211_BAND_6GHZ), IEEE80211_CHAN_NO_HE)) - return; + return 0; sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (!iftd) - return; + return 0; /* Check for device HE 6 GHz capability before adding element */ if (!iftd->he_6ghz_capa.capa) - return; + return 0; - cap = le16_to_cpu(iftd->he_6ghz_capa.capa); - cap &= ~IEEE80211_HE_6GHZ_CAP_SM_PS; + cap = iftd->he_6ghz_capa.capa; + cap &= cpu_to_le16(~IEEE80211_HE_6GHZ_CAP_SM_PS); switch (smps_mode) { case IEEE80211_SMPS_AUTOMATIC: @@ -3318,22 +2579,27 @@ void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: - cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, - IEEE80211_HE_6GHZ_CAP_SM_PS); + cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, + IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_STATIC: - cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, - IEEE80211_HE_6GHZ_CAP_SM_PS); + cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); break; case IEEE80211_SMPS_DYNAMIC: - cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, - IEEE80211_HE_6GHZ_CAP_SM_PS); + cap |= le16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); break; } - pos = skb_put(skb, 2 + 1 + sizeof(cap)); - ieee80211_write_he_6ghz_cap(pos, cpu_to_le16(cap), - pos + 2 + 1 + sizeof(cap)); + if (skb_tailroom(skb) < 2 + 1 + sizeof(cap)) + return -ENOBUFS; + + skb_put_u8(skb, WLAN_EID_EXTENSION); + skb_put_u8(skb, 1 + sizeof(cap)); + skb_put_u8(skb, WLAN_EID_EXT_HE_6GHZ_CAPA); + skb_put_data(skb, &cap, sizeof(cap)); + return 0; } u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, @@ -3785,7 +3051,6 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, } void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, - bool support_160, bool support_320, struct cfg80211_chan_def *chandef) { chandef->center_freq1 = @@ -3804,90 +3069,38 @@ void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info, chandef->width = NL80211_CHAN_WIDTH_80; break; case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ: - if (support_160) { - chandef->width = NL80211_CHAN_WIDTH_160; - chandef->center_freq1 = - ieee80211_channel_to_frequency(info->ccfs1, - chandef->chan->band); - } else { - chandef->width = NL80211_CHAN_WIDTH_80; - } + chandef->width = NL80211_CHAN_WIDTH_160; + chandef->center_freq1 = + ieee80211_channel_to_frequency(info->ccfs1, + chandef->chan->band); break; case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ: - if (support_320) { - chandef->width = NL80211_CHAN_WIDTH_320; - chandef->center_freq1 = - ieee80211_channel_to_frequency(info->ccfs1, - chandef->chan->band); - } else if (support_160) { - chandef->width = NL80211_CHAN_WIDTH_160; - } else { - chandef->width = NL80211_CHAN_WIDTH_80; - - if (chandef->center_freq1 > chandef->chan->center_freq) - chandef->center_freq1 -= 40; - else - chandef->center_freq1 += 40; - } + chandef->width = NL80211_CHAN_WIDTH_320; + chandef->center_freq1 = + ieee80211_channel_to_frequency(info->ccfs1, + chandef->chan->band); break; } } -bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; - enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); - const struct ieee80211_sta_he_cap *he_cap; - const struct ieee80211_sta_eht_cap *eht_cap; struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; - struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; - bool support_80_80, support_160, support_320; - u8 he_phy_cap, eht_phy_cap; u32 freq; if (chandef->chan->band != NL80211_BAND_6GHZ) return true; - sband = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; - - he_cap = ieee80211_get_he_iftype_cap(sband, iftype); - if (!he_cap) { - sdata_info(sdata, "Missing iftype sband data/HE cap"); + if (!he_oper) return false; - } - - he_phy_cap = he_cap->he_cap_elem.phy_cap_info[0]; - support_160 = - he_phy_cap & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; - support_80_80 = - he_phy_cap & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; - - if (!he_oper) { - sdata_info(sdata, - "HE is not advertised on (on %d MHz), expect issues\n", - chandef->chan->center_freq); - return false; - } - - eht_cap = ieee80211_get_eht_iftype_cap(sband, iftype); - if (!eht_cap) - eht_oper = NULL; he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); - - if (!he_6ghz_oper) { - sdata_info(sdata, - "HE 6GHz operation missing (on %d MHz), expect issues\n", - chandef->chan->center_freq); + if (!he_6ghz_oper) return false; - } /* * The EHT operation IE does not contain the primary channel so the @@ -3896,20 +3109,10 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, */ freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, NL80211_BAND_6GHZ); - he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); + he_chandef.chan = ieee80211_get_channel(local->hw.wiphy, freq); - switch (u8_get_bits(he_6ghz_oper->control, - IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - bss_conf->power_type = IEEE80211_REG_LPI_AP; - break; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - bss_conf->power_type = IEEE80211_REG_SP_AP; - break; - default: - bss_conf->power_type = IEEE80211_REG_UNSET_AP; - break; - } + if (!he_chandef.chan) + return false; if (!eht_oper || !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) { @@ -3928,13 +3131,10 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, he_chandef.width = NL80211_CHAN_WIDTH_80; if (!he_6ghz_oper->ccfs1) break; - if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) { - if (support_160) - he_chandef.width = NL80211_CHAN_WIDTH_160; - } else { - if (support_80_80) - he_chandef.width = NL80211_CHAN_WIDTH_80P80; - } + if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) + he_chandef.width = NL80211_CHAN_WIDTH_160; + else + he_chandef.width = NL80211_CHAN_WIDTH_80P80; break; } @@ -3946,30 +3146,19 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, he_chandef.center_freq1 = ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, NL80211_BAND_6GHZ); - if (support_80_80 || support_160) - he_chandef.center_freq2 = - ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, - NL80211_BAND_6GHZ); + he_chandef.center_freq2 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); } } else { - eht_phy_cap = eht_cap->eht_cap_elem.phy_cap_info[0]; - support_320 = - eht_phy_cap & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; - ieee80211_chandef_eht_oper((const void *)eht_oper->optional, - support_160, support_320, &he_chandef); + he_chandef.punctured = + ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); } - if (!cfg80211_chandef_valid(&he_chandef)) { - sdata_info(sdata, - "HE 6GHz operation resulted in invalid chandef: %d MHz/%d/%d MHz/%d MHz\n", - he_chandef.chan ? he_chandef.chan->center_freq : 0, - he_chandef.width, - he_chandef.center_freq1, - he_chandef.center_freq2); + if (!cfg80211_chandef_valid(&he_chandef)) return false; - } *chandef = he_chandef; @@ -4012,121 +3201,62 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, return true; } -int ieee80211_parse_bitrates(enum nl80211_chan_width width, - const struct ieee80211_supported_band *sband, - const u8 *srates, int srates_len, u32 *rates) +int ieee80211_put_srates_elem(struct sk_buff *skb, + const struct ieee80211_supported_band *sband, + u32 basic_rates, u32 rate_flags, u32 masked_rates, + u8 element_id) { - u32 rate_flags = ieee80211_chanwidth_rate_flags(width); - struct ieee80211_rate *br; - int brate, rate, i, j, count = 0; - - *rates = 0; - - for (i = 0; i < srates_len; i++) { - rate = srates[i] & 0x7f; - - for (j = 0; j < sband->n_bitrates; j++) { - br = &sband->bitrates[j]; - if ((rate_flags & br->flags) != rate_flags) - continue; + u8 i, rates, skip; - brate = DIV_ROUND_UP(br->bitrate, 5); - if (brate == rate) { - *rates |= BIT(j); - count++; - break; - } - } - } - return count; -} - -int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, bool need_basic, - enum nl80211_band band) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; - int rate; - u8 i, rates, *pos; - u32 basic_rates = sdata->vif.bss_conf.basic_rates; - u32 rate_flags; - - rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); - sband = local->hw.wiphy->bands[band]; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; + if (masked_rates & BIT(i)) + continue; rates++; } - if (rates > 8) - rates = 8; - - if (skb_tailroom(skb) < rates + 2) - return -ENOMEM; - pos = skb_put(skb, rates + 2); - *pos++ = WLAN_EID_SUPP_RATES; - *pos++ = rates; - for (i = 0; i < rates; i++) { - u8 basic = 0; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - - if (need_basic && basic_rates & BIT(i)) - basic = 0x80; - rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); - *pos++ = basic | (u8) rate; + if (element_id == WLAN_EID_SUPP_RATES) { + rates = min_t(u8, rates, 8); + skip = 0; + } else { + skip = 8; + if (rates <= skip) + return 0; + rates -= skip; } - return 0; -} + if (skb_tailroom(skb) < rates + 2) + return -ENOBUFS; -int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, bool need_basic, - enum nl80211_band band) -{ - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; - int rate; - u8 i, exrates, *pos; - u32 basic_rates = sdata->vif.bss_conf.basic_rates; - u32 rate_flags; + skb_put_u8(skb, element_id); + skb_put_u8(skb, rates); - rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); + for (i = 0; i < sband->n_bitrates && rates; i++) { + int rate; + u8 basic; - sband = local->hw.wiphy->bands[band]; - exrates = 0; - for (i = 0; i < sband->n_bitrates; i++) { if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; - exrates++; - } + if (masked_rates & BIT(i)) + continue; - if (exrates > 8) - exrates -= 8; - else - exrates = 0; + if (skip > 0) { + skip--; + continue; + } - if (skb_tailroom(skb) < exrates + 2) - return -ENOMEM; + basic = basic_rates & BIT(i) ? 0x80 : 0; - if (exrates) { - pos = skb_put(skb, exrates + 2); - *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = exrates; - for (i = 8; i < sband->n_bitrates; i++) { - u8 basic = 0; - if ((rate_flags & sband->bitrates[i].flags) - != rate_flags) - continue; - if (need_basic && basic_rates & BIT(i)) - basic = 0x80; - rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); - *pos++ = basic | (u8) rate; - } + rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5); + skb_put_u8(skb, basic | (u8)rate); + rates--; } + + WARN(rates > 0, "rates confused: rates:%d, element:%d\n", + rates, element_id); + return 0; } @@ -4338,7 +3468,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) &sdata->deflink.dfs_cac_timer_work); if (sdata->wdev.cac_started) { - chandef = sdata->vif.bss_conf.chandef; + chandef = sdata->vif.bss_conf.chanreq.oper; ieee80211_link_release_channel(&sdata->deflink); cfg80211_cac_event(sdata->dev, &chandef, @@ -4386,78 +3516,92 @@ void ieee80211_radar_detected(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_radar_detected); -ieee80211_conn_flags_t ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) +void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c, + struct ieee80211_conn_settings *conn) { - ieee80211_conn_flags_t ret; - int tmp; + enum nl80211_chan_width new_primary_width; + struct ieee80211_conn_settings _ignored = {}; + + /* allow passing NULL if caller doesn't care */ + if (!conn) + conn = &_ignored; + +again: + /* no-HT indicates nothing to do */ + new_primary_width = NL80211_CHAN_WIDTH_20_NOHT; switch (c->width) { + default: + case NL80211_CHAN_WIDTH_20_NOHT: + WARN_ON_ONCE(1); + fallthrough; case NL80211_CHAN_WIDTH_20: c->width = NL80211_CHAN_WIDTH_20_NOHT; - ret = IEEE80211_CONN_DISABLE_HT | IEEE80211_CONN_DISABLE_VHT; + conn->mode = IEEE80211_CONN_MODE_LEGACY; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + c->punctured = 0; break; case NL80211_CHAN_WIDTH_40: c->width = NL80211_CHAN_WIDTH_20; c->center_freq1 = c->chan->center_freq; - ret = IEEE80211_CONN_DISABLE_40MHZ | - IEEE80211_CONN_DISABLE_VHT; + if (conn->mode == IEEE80211_CONN_MODE_VHT) + conn->mode = IEEE80211_CONN_MODE_HT; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + c->punctured = 0; break; case NL80211_CHAN_WIDTH_80: - tmp = (30 + c->chan->center_freq - c->center_freq1)/20; - /* n_P40 */ - tmp /= 2; - /* freq_P40 */ - c->center_freq1 = c->center_freq1 - 20 + 40 * tmp; - c->width = NL80211_CHAN_WIDTH_40; - ret = IEEE80211_CONN_DISABLE_VHT; + new_primary_width = NL80211_CHAN_WIDTH_40; + if (conn->mode == IEEE80211_CONN_MODE_VHT) + conn->mode = IEEE80211_CONN_MODE_HT; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_40; break; case NL80211_CHAN_WIDTH_80P80: c->center_freq2 = 0; c->width = NL80211_CHAN_WIDTH_80; - ret = IEEE80211_CONN_DISABLE_80P80MHZ | - IEEE80211_CONN_DISABLE_160MHZ; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_160: - /* n_P20 */ - tmp = (70 + c->chan->center_freq - c->center_freq1)/20; - /* n_P80 */ - tmp /= 4; - c->center_freq1 = c->center_freq1 - 40 + 80 * tmp; - c->width = NL80211_CHAN_WIDTH_80; - ret = IEEE80211_CONN_DISABLE_80P80MHZ | - IEEE80211_CONN_DISABLE_160MHZ; + new_primary_width = NL80211_CHAN_WIDTH_80; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; break; case NL80211_CHAN_WIDTH_320: - /* n_P20 */ - tmp = (150 + c->chan->center_freq - c->center_freq1) / 20; - /* n_P160 */ - tmp /= 8; - c->center_freq1 = c->center_freq1 - 80 + 160 * tmp; - c->width = NL80211_CHAN_WIDTH_160; - ret = IEEE80211_CONN_DISABLE_320MHZ; - break; - default: - case NL80211_CHAN_WIDTH_20_NOHT: - WARN_ON_ONCE(1); - c->width = NL80211_CHAN_WIDTH_20_NOHT; - ret = IEEE80211_CONN_DISABLE_HT | IEEE80211_CONN_DISABLE_VHT; + new_primary_width = NL80211_CHAN_WIDTH_160; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; break; case NL80211_CHAN_WIDTH_1: case NL80211_CHAN_WIDTH_2: case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: + WARN_ON_ONCE(1); + /* keep c->width */ + conn->mode = IEEE80211_CONN_MODE_S1G; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + break; case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); /* keep c->width */ - ret = IEEE80211_CONN_DISABLE_HT | IEEE80211_CONN_DISABLE_VHT; + conn->mode = IEEE80211_CONN_MODE_LEGACY; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; } - WARN_ON_ONCE(!cfg80211_chandef_valid(c)); + if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) { + c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width, + &c->punctured); + c->width = new_primary_width; + } - return ret; + /* + * With an 80 MHz channel, we might have the puncturing in the primary + * 40 Mhz channel, but that's not valid when downgraded to 40 MHz width. + * In that case, downgrade again. + */ + if (!cfg80211_chandef_valid(c) && c->punctured) + goto again; + + WARN_ON_ONCE(!cfg80211_chandef_valid(c)); } /* @@ -4773,7 +3917,7 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) if (link->reserved_radar_required) - radar_detect |= BIT(link->reserved_chandef.width); + radar_detect |= BIT(link->reserved.oper.width); /* * An in-place reservation context should not have any assigned vifs @@ -4787,7 +3931,7 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, continue; radar_detect |= - BIT(link->conf->chandef.width); + BIT(link->conf->chanreq.oper.width); } return radar_detect; @@ -5037,7 +4181,8 @@ u16 ieee80211_encode_usf(int listen_interval) return (u16) listen_interval; } -u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) +/* this may return more than ieee80211_put_eht_cap() will need */ +u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata) { const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; @@ -5049,13 +4194,12 @@ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) if (!sband) return 0; - he_cap = ieee80211_get_he_iftype_cap(sband, iftype); - eht_cap = ieee80211_get_eht_iftype_cap(sband, iftype); + he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); + eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!he_cap || !eht_cap) return 0; - is_ap = iftype == NL80211_IFTYPE_AP || - iftype == NL80211_IFTYPE_P2P_GO; + is_ap = sdata->vif.type == NL80211_IFTYPE_AP; n = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, &eht_cap->eht_cap_elem, @@ -5067,45 +4211,134 @@ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata, u8 iftype) return 0; } -u8 *ieee80211_ie_build_eht_cap(u8 *pos, - const struct ieee80211_sta_he_cap *he_cap, - const struct ieee80211_sta_eht_cap *eht_cap, - u8 *end, - bool for_ap) +int ieee80211_put_eht_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband, + const struct ieee80211_conn_settings *conn) { + const struct ieee80211_sta_he_cap *he_cap = + ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); + const struct ieee80211_sta_eht_cap *eht_cap = + ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); + bool for_ap = sdata->vif.type == NL80211_IFTYPE_AP; + struct ieee80211_eht_cap_elem_fixed fixed; + struct ieee80211_he_cap_elem he; u8 mcs_nss_len, ppet_len; + u8 orig_mcs_nss_len; u8 ie_len; - u8 *orig_pos = pos; + + if (!conn) + conn = &ieee80211_conn_settings_unlimited; /* Make sure we have place for the IE */ if (!he_cap || !eht_cap) - return orig_pos; + return 0; + + orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + for_ap); + + ieee80211_get_adjusted_he_cap(conn, he_cap, &he); + + fixed = eht_cap->eht_cap_elem; + + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80) + fixed.phy_cap_info[6] &= + ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ; - mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, - &eht_cap->eht_cap_elem, - for_ap); + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) { + fixed.phy_cap_info[1] &= + ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK; + fixed.phy_cap_info[2] &= + ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK; + fixed.phy_cap_info[6] &= + ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ; + } + + if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) { + fixed.phy_cap_info[0] &= + ~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; + fixed.phy_cap_info[1] &= + ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK; + fixed.phy_cap_info[2] &= + ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; + fixed.phy_cap_info[6] &= + ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ; + } + + if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) + fixed.phy_cap_info[0] &= + ~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap); ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0], - eht_cap->eht_cap_elem.phy_cap_info); + fixed.phy_cap_info); ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len; - if ((end - pos) < ie_len) - return orig_pos; + if (skb_tailroom(skb) < ie_len) + return -ENOBUFS; - *pos++ = WLAN_EID_EXTENSION; - *pos++ = ie_len - 2; - *pos++ = WLAN_EID_EXT_EHT_CAPABILITY; + skb_put_u8(skb, WLAN_EID_EXTENSION); + skb_put_u8(skb, ie_len - 2); + skb_put_u8(skb, WLAN_EID_EXT_EHT_CAPABILITY); + skb_put_data(skb, &fixed, sizeof(fixed)); - /* Fixed data */ - memcpy(pos, &eht_cap->eht_cap_elem, sizeof(eht_cap->eht_cap_elem)); - pos += sizeof(eht_cap->eht_cap_elem); + if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) { + /* + * If the (non-AP) STA became 20 MHz only, then convert from + * <=80 to 20-MHz-only format, where MCSes are indicated in + * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9, + * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9. + */ + skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); + skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss); + skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss); + skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss); + } else { + skb_put_data(skb, &eht_cap->eht_mcs_nss_supp, mcs_nss_len); + } - memcpy(pos, &eht_cap->eht_mcs_nss_supp, mcs_nss_len); - pos += mcs_nss_len; + if (ppet_len) + skb_put_data(skb, &eht_cap->eht_ppe_thres, ppet_len); - if (ppet_len) { - memcpy(pos, &eht_cap->eht_ppe_thres, ppet_len); - pos += ppet_len; - } + return 0; +} - return pos; +const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) +{ + static const char * const modes[] = { + [IEEE80211_CONN_MODE_S1G] = "S1G", + [IEEE80211_CONN_MODE_LEGACY] = "legacy", + [IEEE80211_CONN_MODE_HT] = "HT", + [IEEE80211_CONN_MODE_VHT] = "VHT", + [IEEE80211_CONN_MODE_HE] = "HE", + [IEEE80211_CONN_MODE_EHT] = "EHT", + }; + + if (WARN_ON(mode >= ARRAY_SIZE(modes))) + return "<out of range>"; + + return modes[mode] ?: "<missing string>"; +} + +enum ieee80211_conn_bw_limit +ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef) +{ + switch (chandef->width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return IEEE80211_CONN_BW_LIMIT_20; + case NL80211_CHAN_WIDTH_40: + return IEEE80211_CONN_BW_LIMIT_40; + case NL80211_CHAN_WIDTH_80: + return IEEE80211_CONN_BW_LIMIT_80; + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + return IEEE80211_CONN_BW_LIMIT_160; + case NL80211_CHAN_WIDTH_320: + return IEEE80211_CONN_BW_LIMIT_320; + default: + WARN(1, "unhandled chandef width %d\n", chandef->width); + return IEEE80211_CONN_BW_LIMIT_20; + } } diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index bc13b14199..642891cafb 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -369,7 +369,7 @@ ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta) link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); if (eht_cap->has_eht && - link_conf->chandef.chan->band == NL80211_BAND_6GHZ) { + link_conf->chanreq.oper.chan->band == NL80211_BAND_6GHZ) { info = eht_cap->eht_cap_elem.phy_cap_info[0]; if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) { @@ -380,7 +380,7 @@ ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta) info = he_cap->he_cap_elem.phy_cap_info[0]; - if (link_conf->chandef.chan->band == NL80211_BAND_2GHZ) { + if (link_conf->chanreq.oper.chan->band == NL80211_BAND_2GHZ) { if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) ret = IEEE80211_STA_RX_BW_40; else @@ -515,7 +515,7 @@ ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta) if (WARN_ON(!link_conf)) bss_width = NL80211_CHAN_WIDTH_20_NOHT; else - bss_width = link_conf->chandef.width; + bss_width = link_conf->chanreq.oper.width; rcu_read_unlock(); bw = ieee80211_sta_cap_rx_bw(link_sta); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 94dae7cb6d..047a337970 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -315,7 +315,7 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) * Calculate AAD for CCMP/GCMP, returning qos_tid since we * need that in CCMP also for b_0. */ -static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad) +static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu) { struct ieee80211_hdr *hdr = (void *)skb->data; __le16 mask_fc; @@ -340,7 +340,14 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad) len_a += 6; if (ieee80211_is_data_qos(hdr->frame_control)) { - qos_tid = ieee80211_get_tid(hdr); + qos_tid = *ieee80211_get_qos_ctl(hdr); + + if (spp_amsdu) + qos_tid &= IEEE80211_QOS_CTL_TID_MASK | + IEEE80211_QOS_CTL_A_MSDU_PRESENT; + else + qos_tid &= IEEE80211_QOS_CTL_TID_MASK; + mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_ORDER); len_a += 2; } else { @@ -369,10 +376,11 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad) return qos_tid; } -static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad) +static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad, + bool spp_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - u8 qos_tid = ccmp_gcmp_aad(skb, aad); + u8 qos_tid = ccmp_gcmp_aad(skb, aad, spp_amsdu); /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC * mode authentication are not allowed to collide, yet both are derived @@ -479,7 +487,8 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, return 0; pos += IEEE80211_CCMP_HDR_LEN; - ccmp_special_blocks(skb, pn, b_0, aad); + ccmp_special_blocks(skb, pn, b_0, aad, + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, skb_put(skb, mic_len)); } @@ -557,7 +566,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, u8 aad[2 * AES_BLOCK_SIZE]; u8 b_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ - ccmp_special_blocks(skb, pn, b_0, aad); + ccmp_special_blocks(skb, pn, b_0, aad, + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); if (ieee80211_aes_ccm_decrypt( key->u.ccmp.tfm, b_0, aad, @@ -581,7 +591,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_CONTINUE; } -static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad) +static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad, + bool spp_amsdu) { struct ieee80211_hdr *hdr = (void *)skb->data; @@ -591,7 +602,7 @@ static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad) j_0[14] = 0; j_0[AES_BLOCK_SIZE - 1] = 0x01; - ccmp_gcmp_aad(skb, aad); + ccmp_gcmp_aad(skb, aad, spp_amsdu); } static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id) @@ -680,7 +691,8 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) return 0; pos += IEEE80211_GCMP_HDR_LEN; - gcmp_special_blocks(skb, pn, j_0, aad); + gcmp_special_blocks(skb, pn, j_0, aad, + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len, skb_put(skb, IEEE80211_GCMP_MIC_LEN)); } @@ -753,7 +765,8 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) u8 aad[2 * AES_BLOCK_SIZE]; u8 j_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ - gcmp_special_blocks(skb, pn, j_0, aad); + gcmp_special_blocks(skb, pn, j_0, aad, + key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU); if (ieee80211_aes_gcm_decrypt( key->u.gcmp.tfm, j_0, aad, @@ -882,7 +895,8 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) info = IEEE80211_SKB_CB(skb); - if (info->control.hw_key) + if (info->control.hw_key && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) @@ -898,6 +912,9 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) bip_ipn_set64(mmie->sequence_number, pn64); + if (info->control.hw_key) + return TX_CONTINUE; + bip_aad(skb, aad); /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) @@ -1027,7 +1044,8 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) info = IEEE80211_SKB_CB(skb); - if (info->control.hw_key) + if (info->control.hw_key && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) @@ -1043,6 +1061,9 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) bip_ipn_set64(mmie->sequence_number, pn64); + if (info->control.hw_key) + return TX_CONTINUE; + bip_aad(skb, aad); hdr = (struct ieee80211_hdr *)skb->data; diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 9ab7396668..21b7c3b280 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -161,8 +161,10 @@ void ieee802154_configure_durations(struct wpan_phy *phy, } phy->symbol_duration = duration; - phy->lifs_period = (IEEE802154_LIFS_PERIOD * phy->symbol_duration) / NSEC_PER_SEC; - phy->sifs_period = (IEEE802154_SIFS_PERIOD * phy->symbol_duration) / NSEC_PER_SEC; + phy->lifs_period = + (IEEE802154_LIFS_PERIOD * phy->symbol_duration) / NSEC_PER_USEC; + phy->sifs_period = + (IEEE802154_SIFS_PERIOD * phy->symbol_duration) / NSEC_PER_USEC; } EXPORT_SYMBOL(ieee802154_configure_durations); @@ -184,10 +186,10 @@ static void ieee802154_setup_wpan_phy_pib(struct wpan_phy *wpan_phy) * Should be done when all drivers sets this value. */ - wpan_phy->lifs_period = - (IEEE802154_LIFS_PERIOD * wpan_phy->symbol_duration) / 1000; - wpan_phy->sifs_period = - (IEEE802154_SIFS_PERIOD * wpan_phy->symbol_duration) / 1000; + wpan_phy->lifs_period = (IEEE802154_LIFS_PERIOD * + wpan_phy->symbol_duration) / NSEC_PER_USEC; + wpan_phy->sifs_period = (IEEE802154_SIFS_PERIOD * + wpan_phy->symbol_duration) / NSEC_PER_USEC; } int ieee802154_register_hw(struct ieee802154_hw *hw) diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 2a6f1ed763..6fbed5bb5c 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -34,8 +34,8 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) if (res) goto err_tx; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + DEV_STATS_INC(dev, tx_packets); + DEV_STATS_ADD(dev, tx_bytes, skb->len); ieee802154_xmit_complete(&local->hw, skb, false); @@ -90,8 +90,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) if (ret) goto err_wake_netif_queue; - dev->stats.tx_packets++; - dev->stats.tx_bytes += len; + DEV_STATS_INC(dev, tx_packets); + DEV_STATS_ADD(dev, tx_bytes, len); } else { local->tx_skb = skb; queue_work(local->workqueue, &local->sync_tx_work); diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig index 3a5c0e70da..d8d3413a37 100644 --- a/net/mctp/Kconfig +++ b/net/mctp/Kconfig @@ -14,6 +14,7 @@ menuconfig MCTP config MCTP_TEST bool "MCTP core tests" if !KUNIT_ALL_TESTS + select MCTP_FLOWS depends on MCTP=y && KUNIT=y default KUNIT_ALL_TESTS diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index f6be58b68c..de52a9191d 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -350,30 +350,102 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; } -static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg) +/* helpers for reading/writing the tag ioc, handling compatibility across the + * two versions, and some basic API error checking + */ +static int mctp_ioctl_tag_copy_from_user(unsigned long arg, + struct mctp_ioc_tag_ctl2 *ctl, + bool tagv2) +{ + struct mctp_ioc_tag_ctl ctl_compat; + unsigned long size; + void *ptr; + int rc; + + if (tagv2) { + size = sizeof(*ctl); + ptr = ctl; + } else { + size = sizeof(ctl_compat); + ptr = &ctl_compat; + } + + rc = copy_from_user(ptr, (void __user *)arg, size); + if (rc) + return -EFAULT; + + if (!tagv2) { + /* compat, using defaults for new fields */ + ctl->net = MCTP_INITIAL_DEFAULT_NET; + ctl->peer_addr = ctl_compat.peer_addr; + ctl->local_addr = MCTP_ADDR_ANY; + ctl->flags = ctl_compat.flags; + ctl->tag = ctl_compat.tag; + } + + if (ctl->flags) + return -EINVAL; + + if (ctl->local_addr != MCTP_ADDR_ANY && + ctl->local_addr != MCTP_ADDR_NULL) + return -EINVAL; + + return 0; +} + +static int mctp_ioctl_tag_copy_to_user(unsigned long arg, + struct mctp_ioc_tag_ctl2 *ctl, + bool tagv2) +{ + struct mctp_ioc_tag_ctl ctl_compat; + unsigned long size; + void *ptr; + int rc; + + if (tagv2) { + ptr = ctl; + size = sizeof(*ctl); + } else { + ctl_compat.peer_addr = ctl->peer_addr; + ctl_compat.tag = ctl->tag; + ctl_compat.flags = ctl->flags; + + ptr = &ctl_compat; + size = sizeof(ctl_compat); + } + + rc = copy_to_user((void __user *)arg, ptr, size); + if (rc) + return -EFAULT; + + return 0; +} + +static int mctp_ioctl_alloctag(struct mctp_sock *msk, bool tagv2, + unsigned long arg) { struct net *net = sock_net(&msk->sk); struct mctp_sk_key *key = NULL; - struct mctp_ioc_tag_ctl ctl; + struct mctp_ioc_tag_ctl2 ctl; unsigned long flags; u8 tag; + int rc; - if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl))) - return -EFAULT; + rc = mctp_ioctl_tag_copy_from_user(arg, &ctl, tagv2); + if (rc) + return rc; if (ctl.tag) return -EINVAL; - if (ctl.flags) - return -EINVAL; - - key = mctp_alloc_local_tag(msk, ctl.peer_addr, MCTP_ADDR_ANY, - true, &tag); + key = mctp_alloc_local_tag(msk, ctl.net, MCTP_ADDR_ANY, + ctl.peer_addr, true, &tag); if (IS_ERR(key)) return PTR_ERR(key); ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC; - if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) { + rc = mctp_ioctl_tag_copy_to_user(arg, &ctl, tagv2); + if (rc) { unsigned long fl2; /* Unwind our key allocation: the keys list lock needs to be * taken before the individual key locks, and we need a valid @@ -385,28 +457,27 @@ static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg) __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_DROPPED); mctp_key_unref(key); spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - return -EFAULT; + return rc; } mctp_key_unref(key); return 0; } -static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg) +static int mctp_ioctl_droptag(struct mctp_sock *msk, bool tagv2, + unsigned long arg) { struct net *net = sock_net(&msk->sk); - struct mctp_ioc_tag_ctl ctl; + struct mctp_ioc_tag_ctl2 ctl; unsigned long flags, fl2; struct mctp_sk_key *key; struct hlist_node *tmp; int rc; u8 tag; - if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl))) - return -EFAULT; - - if (ctl.flags) - return -EINVAL; + rc = mctp_ioctl_tag_copy_from_user(arg, &ctl, tagv2); + if (rc) + return rc; /* Must be a local tag, TO set, preallocated */ if ((ctl.tag & ~MCTP_TAG_MASK) != (MCTP_TAG_OWNER | MCTP_TAG_PREALLOC)) @@ -422,6 +493,7 @@ static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg) */ spin_lock_irqsave(&key->lock, fl2); if (key->manual_alloc && + ctl.net == key->net && ctl.peer_addr == key->peer_addr && tag == key->tag) { __mctp_key_remove(key, net, fl2, @@ -439,12 +511,17 @@ static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg) static int mctp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + bool tagv2 = false; switch (cmd) { + case SIOCMCTPALLOCTAG2: case SIOCMCTPALLOCTAG: - return mctp_ioctl_alloctag(msk, arg); + tagv2 = cmd == SIOCMCTPALLOCTAG2; + return mctp_ioctl_alloctag(msk, tagv2, arg); case SIOCMCTPDROPTAG: - return mctp_ioctl_droptag(msk, arg); + case SIOCMCTPDROPTAG2: + tagv2 = cmd == SIOCMCTPDROPTAG2; + return mctp_ioctl_droptag(msk, tagv2, arg); } return -EINVAL; diff --git a/net/mctp/route.c b/net/mctp/route.c index 01c530dbc1..eefd7834d9 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -73,13 +73,50 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) return NULL; } -static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, - mctp_eid_t peer, u8 tag) +/* A note on the key allocations. + * + * struct net->mctp.keys contains our set of currently-allocated keys for + * MCTP tag management. The lookup tuple for these is the peer EID, + * local EID and MCTP tag. + * + * In some cases, the peer EID may be MCTP_EID_ANY: for example, when a + * broadcast message is sent, we may receive responses from any peer EID. + * Because the broadcast dest address is equivalent to ANY, we create + * a key with (local = local-eid, peer = ANY). This allows a match on the + * incoming broadcast responses from any peer. + * + * We perform lookups when packets are received, and when tags are allocated + * in two scenarios: + * + * - when a packet is sent, with a locally-owned tag: we need to find an + * unused tag value for the (local, peer) EID pair. + * + * - when a tag is manually allocated: we need to find an unused tag value + * for the peer EID, but don't have a specific local EID at that stage. + * + * in the latter case, on successful allocation, we end up with a tag with + * (local = ANY, peer = peer-eid). + * + * So, the key set allows both a local EID of ANY, as well as a peer EID of + * ANY in the lookup tuple. Both may be ANY if we prealloc for a broadcast. + * The matching (in mctp_key_match()) during lookup allows the match value to + * be ANY in either the dest or source addresses. + * + * When allocating (+ inserting) a tag, we need to check for conflicts amongst + * the existing tag set. This requires macthing either exactly on the local + * and peer addresses, or either being ANY. + */ + +static bool mctp_key_match(struct mctp_sk_key *key, unsigned int net, + mctp_eid_t local, mctp_eid_t peer, u8 tag) { + if (key->net != net) + return false; + if (!mctp_address_matches(key->local_addr, local)) return false; - if (key->peer_addr != peer) + if (!mctp_address_matches(key->peer_addr, peer)) return false; if (key->tag != tag) @@ -92,7 +129,7 @@ static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, * key exists. */ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, - mctp_eid_t peer, + unsigned int netid, mctp_eid_t peer, unsigned long *irqflags) __acquires(&key->lock) { @@ -108,7 +145,7 @@ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry(key, &net->mctp.keys, hlist) { - if (!mctp_key_match(key, mh->dest, peer, tag)) + if (!mctp_key_match(key, netid, mh->dest, peer, tag)) continue; spin_lock(&key->lock); @@ -131,6 +168,7 @@ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, } static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, + unsigned int net, mctp_eid_t local, mctp_eid_t peer, u8 tag, gfp_t gfp) { @@ -140,6 +178,7 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, if (!key) return NULL; + key->net = net; key->peer_addr = peer; key->local_addr = local; key->tag = tag; @@ -185,8 +224,8 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) } hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { - if (mctp_key_match(tmp, key->local_addr, key->peer_addr, - key->tag)) { + if (mctp_key_match(tmp, key->net, key->local_addr, + key->peer_addr, key->tag)) { spin_lock(&tmp->lock); if (tmp->valid) rc = -EEXIST; @@ -327,6 +366,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct mctp_sock *msk; struct mctp_hdr *mh; + unsigned int netid; unsigned long f; u8 tag, flags; int rc; @@ -345,6 +385,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) /* grab header, advance data ptr */ mh = mctp_hdr(skb); + netid = mctp_cb(skb)->net; skb_pull(skb, sizeof(struct mctp_hdr)); if (mh->ver != 1) @@ -358,7 +399,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) /* lookup socket / reasm context, exactly matching (src,dest,tag). * we hold a ref on the key, and key->lock held. */ - key = mctp_lookup_key(net, skb, mh->src, &f); + key = mctp_lookup_key(net, skb, netid, mh->src, &f); if (flags & MCTP_HDR_FLAG_SOM) { if (key) { @@ -368,8 +409,12 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * key lookup to find the socket, but don't use this * key for reassembly - we'll create a more specific * one for future packets if required (ie, !EOM). + * + * this lookup requires key->peer to be MCTP_ADDR_ANY, + * it doesn't match just any key->peer. */ - any_key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); + any_key = mctp_lookup_key(net, skb, netid, + MCTP_ADDR_ANY, &f); if (any_key) { msk = container_of(any_key->sk, struct mctp_sock, sk); @@ -406,7 +451,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * packets for this message */ if (!key) { - key = mctp_key_alloc(msk, mh->dest, mh->src, + key = mctp_key_alloc(msk, netid, mh->dest, mh->src, tag, GFP_ATOMIC); if (!key) { rc = -ENOMEM; @@ -596,11 +641,12 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, refcount_inc(&key->refs); } -/* Allocate a locally-owned tag value for (saddr, daddr), and reserve +/* Allocate a locally-owned tag value for (local, peer), and reserve * it for the socket msk */ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, - mctp_eid_t daddr, mctp_eid_t saddr, + unsigned int netid, + mctp_eid_t local, mctp_eid_t peer, bool manual, u8 *tagp) { struct net *net = sock_net(&msk->sk); @@ -610,11 +656,11 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, u8 tagbits; /* for NULL destination EIDs, we may get a response from any peer */ - if (daddr == MCTP_ADDR_NULL) - daddr = MCTP_ADDR_ANY; + if (peer == MCTP_ADDR_NULL) + peer = MCTP_ADDR_ANY; /* be optimistic, alloc now */ - key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); + key = mctp_key_alloc(msk, netid, local, peer, 0, GFP_KERNEL); if (!key) return ERR_PTR(-ENOMEM); @@ -631,12 +677,24 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, * lock held, they don't change over the lifetime of the key. */ + /* tags are net-specific */ + if (tmp->net != netid) + continue; + /* if we don't own the tag, it can't conflict */ if (tmp->tag & MCTP_HDR_FLAG_TO) continue; - if (!(mctp_address_matches(tmp->peer_addr, daddr) && - mctp_address_matches(tmp->local_addr, saddr))) + /* Since we're avoiding conflicting entries, match peer and + * local addresses, including with a wildcard on ANY. See + * 'A note on key allocations' for background. + */ + if (peer != MCTP_ADDR_ANY && + !mctp_address_matches(tmp->peer_addr, peer)) + continue; + + if (local != MCTP_ADDR_ANY && + !mctp_address_matches(tmp->local_addr, local)) continue; spin_lock(&tmp->lock); @@ -671,6 +729,7 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, } static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, + unsigned int netid, mctp_eid_t daddr, u8 req_tag, u8 *tagp) { @@ -685,6 +744,9 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, spin_lock_irqsave(&mns->keys_lock, flags); hlist_for_each_entry(tmp, &mns->keys, hlist) { + if (tmp->net != netid) + continue; + if (tmp->tag != req_tag) continue; @@ -868,6 +930,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct mctp_sk_key *key; struct mctp_hdr *hdr; unsigned long flags; + unsigned int netid; unsigned int mtu; mctp_eid_t saddr; bool ext_rt; @@ -919,16 +982,17 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, rc = 0; } spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); + netid = READ_ONCE(rt->dev->net); if (rc) goto out_release; if (req_tag & MCTP_TAG_OWNER) { if (req_tag & MCTP_TAG_PREALLOC) - key = mctp_lookup_prealloc_tag(msk, daddr, + key = mctp_lookup_prealloc_tag(msk, netid, daddr, req_tag, &tag); else - key = mctp_alloc_local_tag(msk, daddr, saddr, + key = mctp_alloc_local_tag(msk, netid, saddr, daddr, false, &tag); if (IS_ERR(key)) { diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 92ea4158f7..77e5dd4222 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -79,6 +79,16 @@ static void mctp_test_route_destroy(struct kunit *test, kfree_rcu(&rt->rt, rcu); } +static void mctp_test_skb_set_dev(struct sk_buff *skb, + struct mctp_test_dev *dev) +{ + struct mctp_skb_cb *cb; + + cb = mctp_cb(skb); + cb->net = READ_ONCE(dev->mdev->net); + skb->dev = dev->ndev; +} + static struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, unsigned int data_len) { @@ -91,6 +101,7 @@ static struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, if (!skb) return NULL; + __mctp_cb(skb); memcpy(skb_put(skb, hdr_len), hdr, hdr_len); buf = skb_put(skb, data_len); @@ -111,6 +122,7 @@ static struct sk_buff *__mctp_test_create_skb_data(const struct mctp_hdr *hdr, if (!skb) return NULL; + __mctp_cb(skb); memcpy(skb_put(skb, hdr_len), hdr, hdr_len); memcpy(skb_put(skb, data_len), data, data_len); @@ -249,8 +261,6 @@ static void mctp_test_rx_input(struct kunit *test) skb = mctp_test_create_skb(¶ms->hdr, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); - __mctp_cb(skb); - mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); KUNIT_EXPECT_EQ(test, !!rt->pkts.qlen, params->input); @@ -283,7 +293,8 @@ KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, static void __mctp_route_test_init(struct kunit *test, struct mctp_test_dev **devp, struct mctp_test_route **rtp, - struct socket **sockp) + struct socket **sockp, + unsigned int netid) { struct sockaddr_mctp addr = {0}; struct mctp_test_route *rt; @@ -293,6 +304,8 @@ static void __mctp_route_test_init(struct kunit *test, dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + if (netid != MCTP_NET_ANY) + WRITE_ONCE(dev->mdev->net, netid); rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); @@ -301,7 +314,7 @@ static void __mctp_route_test_init(struct kunit *test, KUNIT_ASSERT_EQ(test, rc, 0); addr.smctp_family = AF_MCTP; - addr.smctp_network = MCTP_NET_ANY; + addr.smctp_network = netid; addr.smctp_addr.s_addr = 8; addr.smctp_type = 0; rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); @@ -339,13 +352,12 @@ static void mctp_test_route_input_sk(struct kunit *test) params = test->param_value; - __mctp_route_test_init(test, &dev, &rt, &sock); + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb_data(¶ms->hdr, ¶ms->type); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); - skb->dev = dev->ndev; - __mctp_cb(skb); + mctp_test_skb_set_dev(skb, dev); rc = mctp_route_input(&rt->rt, skb); @@ -410,15 +422,14 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) params = test->param_value; - __mctp_route_test_init(test, &dev, &rt, &sock); + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); for (i = 0; i < params->n_hdrs; i++) { c = i; skb = mctp_test_create_skb_data(¶ms->hdrs[i], &c); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); - skb->dev = dev->ndev; - __mctp_cb(skb); + mctp_test_skb_set_dev(skb, dev); rc = mctp_route_input(&rt->rt, skb); } @@ -544,6 +555,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) struct mctp_sock *msk; struct socket *sock; unsigned long flags; + unsigned int net; int rc; u8 c; @@ -551,6 +563,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + net = READ_ONCE(dev->mdev->net); rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); @@ -562,8 +575,9 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) mns = &sock_net(sock->sk)->mctp; /* set the incoming tag according to test params */ - key = mctp_key_alloc(msk, params->key_local_addr, params->key_peer_addr, - params->key_tag, GFP_KERNEL); + key = mctp_key_alloc(msk, net, params->key_local_addr, + params->key_peer_addr, params->key_tag, + GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, key); @@ -576,8 +590,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) skb = mctp_test_create_skb_data(¶ms->hdr, &c); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); - skb->dev = dev->ndev; - __mctp_cb(skb); + mctp_test_skb_set_dev(skb, dev); rc = mctp_route_input(&rt->rt, skb); @@ -665,6 +678,373 @@ static void mctp_route_input_sk_keys_to_desc( KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests, mctp_route_input_sk_keys_to_desc); +struct test_net { + unsigned int netid; + struct mctp_test_dev *dev; + struct mctp_test_route *rt; + struct socket *sock; + struct sk_buff *skb; + struct mctp_sk_key *key; + struct { + u8 type; + unsigned int data; + } msg; +}; + +static void +mctp_test_route_input_multiple_nets_bind_init(struct kunit *test, + struct test_net *t) +{ + struct mctp_hdr hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1) | FL_TO); + + t->msg.data = t->netid; + + __mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid); + + t->skb = mctp_test_create_skb_data(&hdr, &t->msg); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb); + mctp_test_skb_set_dev(t->skb, t->dev); +} + +static void +mctp_test_route_input_multiple_nets_bind_fini(struct kunit *test, + struct test_net *t) +{ + __mctp_route_test_fini(test, t->dev, t->rt, t->sock); +} + +/* Test that skbs from different nets (otherwise identical) get routed to their + * corresponding socket via the sockets' bind() + */ +static void mctp_test_route_input_multiple_nets_bind(struct kunit *test) +{ + struct sk_buff *rx_skb1, *rx_skb2; + struct test_net t1, t2; + int rc; + + t1.netid = 1; + t2.netid = 2; + + t1.msg.type = 0; + t2.msg.type = 0; + + mctp_test_route_input_multiple_nets_bind_init(test, &t1); + mctp_test_route_input_multiple_nets_bind_init(test, &t2); + + rc = mctp_route_input(&t1.rt->rt, t1.skb); + KUNIT_ASSERT_EQ(test, rc, 0); + rc = mctp_route_input(&t2.rt->rt, t2.skb); + KUNIT_ASSERT_EQ(test, rc, 0); + + rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, rx_skb1); + KUNIT_EXPECT_EQ(test, rx_skb1->len, sizeof(t1.msg)); + KUNIT_EXPECT_EQ(test, + *(unsigned int *)skb_pull(rx_skb1, sizeof(t1.msg.data)), + t1.netid); + kfree_skb(rx_skb1); + + rx_skb2 = skb_recv_datagram(t2.sock->sk, MSG_DONTWAIT, &rc); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, rx_skb2); + KUNIT_EXPECT_EQ(test, rx_skb2->len, sizeof(t2.msg)); + KUNIT_EXPECT_EQ(test, + *(unsigned int *)skb_pull(rx_skb2, sizeof(t2.msg.data)), + t2.netid); + kfree_skb(rx_skb2); + + mctp_test_route_input_multiple_nets_bind_fini(test, &t1); + mctp_test_route_input_multiple_nets_bind_fini(test, &t2); +} + +static void +mctp_test_route_input_multiple_nets_key_init(struct kunit *test, + struct test_net *t) +{ + struct mctp_hdr hdr = RX_HDR(1, 9, 8, FL_S | FL_E | FL_T(1)); + struct mctp_sock *msk; + struct netns_mctp *mns; + unsigned long flags; + + t->msg.data = t->netid; + + __mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid); + + msk = container_of(t->sock->sk, struct mctp_sock, sk); + + t->key = mctp_key_alloc(msk, t->netid, hdr.dest, hdr.src, 1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->key); + + mns = &sock_net(t->sock->sk)->mctp; + spin_lock_irqsave(&mns->keys_lock, flags); + mctp_reserve_tag(&init_net, t->key, msk); + spin_unlock_irqrestore(&mns->keys_lock, flags); + + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->key); + t->skb = mctp_test_create_skb_data(&hdr, &t->msg); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb); + mctp_test_skb_set_dev(t->skb, t->dev); +} + +static void +mctp_test_route_input_multiple_nets_key_fini(struct kunit *test, + struct test_net *t) +{ + mctp_key_unref(t->key); + __mctp_route_test_fini(test, t->dev, t->rt, t->sock); +} + +/* test that skbs from different nets (otherwise identical) get routed to their + * corresponding socket via the sk_key + */ +static void mctp_test_route_input_multiple_nets_key(struct kunit *test) +{ + struct sk_buff *rx_skb1, *rx_skb2; + struct test_net t1, t2; + int rc; + + t1.netid = 1; + t2.netid = 2; + + /* use type 1 which is not bound */ + t1.msg.type = 1; + t2.msg.type = 1; + + mctp_test_route_input_multiple_nets_key_init(test, &t1); + mctp_test_route_input_multiple_nets_key_init(test, &t2); + + rc = mctp_route_input(&t1.rt->rt, t1.skb); + KUNIT_ASSERT_EQ(test, rc, 0); + rc = mctp_route_input(&t2.rt->rt, t2.skb); + KUNIT_ASSERT_EQ(test, rc, 0); + + rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, rx_skb1); + KUNIT_EXPECT_EQ(test, rx_skb1->len, sizeof(t1.msg)); + KUNIT_EXPECT_EQ(test, + *(unsigned int *)skb_pull(rx_skb1, sizeof(t1.msg.data)), + t1.netid); + kfree_skb(rx_skb1); + + rx_skb2 = skb_recv_datagram(t2.sock->sk, MSG_DONTWAIT, &rc); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, rx_skb2); + KUNIT_EXPECT_EQ(test, rx_skb2->len, sizeof(t2.msg)); + KUNIT_EXPECT_EQ(test, + *(unsigned int *)skb_pull(rx_skb2, sizeof(t2.msg.data)), + t2.netid); + kfree_skb(rx_skb2); + + mctp_test_route_input_multiple_nets_key_fini(test, &t1); + mctp_test_route_input_multiple_nets_key_fini(test, &t2); +} + +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + +static void mctp_test_flow_init(struct kunit *test, + struct mctp_test_dev **devp, + struct mctp_test_route **rtp, + struct socket **sock, + struct sk_buff **skbp, + unsigned int len) +{ + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skb; + + /* we have a slightly odd routing setup here; the test route + * is for EID 8, which is our local EID. We don't do a routing + * lookup, so that's fine - all we require is a path through + * mctp_local_output, which will call rt->output on whatever + * route we provide + */ + __mctp_route_test_init(test, &dev, &rt, sock, MCTP_NET_ANY); + + /* Assign a single EID. ->addrs is freed on mctp netdev release */ + dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); + dev->mdev->num_addrs = 1; + dev->mdev->addrs[0] = 8; + + skb = alloc_skb(len + sizeof(struct mctp_hdr) + 1, GFP_KERNEL); + KUNIT_ASSERT_TRUE(test, skb); + __mctp_cb(skb); + skb_reserve(skb, sizeof(struct mctp_hdr) + 1); + memset(skb_put(skb, len), 0, len); + + /* take a ref for the route, we'll decrement in local output */ + refcount_inc(&rt->rt.refs); + + *devp = dev; + *rtp = rt; + *skbp = skb; +} + +static void mctp_test_flow_fini(struct kunit *test, + struct mctp_test_dev *dev, + struct mctp_test_route *rt, + struct socket *sock) +{ + __mctp_route_test_fini(test, dev, rt, sock); +} + +/* test that an outgoing skb has the correct MCTP extension data set */ +static void mctp_test_packet_flow(struct kunit *test) +{ + struct sk_buff *skb, *skb2; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct mctp_flow *flow; + struct socket *sock; + u8 dst = 8; + int n, rc; + + mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 30); + + rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + KUNIT_ASSERT_EQ(test, rc, 0); + + n = rt->pkts.qlen; + KUNIT_ASSERT_EQ(test, n, 1); + + skb2 = skb_dequeue(&rt->pkts); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2); + + flow = skb_ext_find(skb2, SKB_EXT_MCTP); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, flow); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, flow->key); + KUNIT_ASSERT_PTR_EQ(test, flow->key->sk, sock->sk); + + kfree_skb(skb2); + mctp_test_flow_fini(test, dev, rt, sock); +} + +/* test that outgoing skbs, after fragmentation, all have the correct MCTP + * extension data set. + */ +static void mctp_test_fragment_flow(struct kunit *test) +{ + struct mctp_flow *flows[2]; + struct sk_buff *tx_skbs[2]; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skb; + struct socket *sock; + u8 dst = 8; + int n, rc; + + mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 100); + + rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + KUNIT_ASSERT_EQ(test, rc, 0); + + n = rt->pkts.qlen; + KUNIT_ASSERT_EQ(test, n, 2); + + /* both resulting packets should have the same flow data */ + tx_skbs[0] = skb_dequeue(&rt->pkts); + tx_skbs[1] = skb_dequeue(&rt->pkts); + + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[0]); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[1]); + + flows[0] = skb_ext_find(tx_skbs[0], SKB_EXT_MCTP); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, flows[0]); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, flows[0]->key); + KUNIT_ASSERT_PTR_EQ(test, flows[0]->key->sk, sock->sk); + + flows[1] = skb_ext_find(tx_skbs[1], SKB_EXT_MCTP); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, flows[1]); + KUNIT_ASSERT_PTR_EQ(test, flows[1]->key, flows[0]->key); + + kfree_skb(tx_skbs[0]); + kfree_skb(tx_skbs[1]); + mctp_test_flow_fini(test, dev, rt, sock); +} + +#else +static void mctp_test_packet_flow(struct kunit *test) +{ + kunit_skip(test, "Requires CONFIG_MCTP_FLOWS=y"); +} + +static void mctp_test_fragment_flow(struct kunit *test) +{ + kunit_skip(test, "Requires CONFIG_MCTP_FLOWS=y"); +} +#endif + +/* Test that outgoing skbs cause a suitable tag to be created */ +static void mctp_test_route_output_key_create(struct kunit *test) +{ + const unsigned int netid = 50; + const u8 dst = 26, src = 15; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct mctp_sk_key *key; + struct netns_mctp *mns; + unsigned long flags; + struct socket *sock; + struct sk_buff *skb; + bool empty, single; + const int len = 2; + int rc; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + WRITE_ONCE(dev->mdev->net, netid); + + rt = mctp_test_create_route(&init_net, dev->mdev, dst, 68); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + + rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); + KUNIT_ASSERT_EQ(test, rc, 0); + + dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); + dev->mdev->num_addrs = 1; + dev->mdev->addrs[0] = src; + + skb = alloc_skb(sizeof(struct mctp_hdr) + 1 + len, GFP_KERNEL); + KUNIT_ASSERT_TRUE(test, skb); + __mctp_cb(skb); + skb_reserve(skb, sizeof(struct mctp_hdr) + 1 + len); + memset(skb_put(skb, len), 0, len); + + refcount_inc(&rt->rt.refs); + + mns = &sock_net(sock->sk)->mctp; + + /* We assume we're starting from an empty keys list, which requires + * preceding tests to clean up correctly! + */ + spin_lock_irqsave(&mns->keys_lock, flags); + empty = hlist_empty(&mns->keys); + spin_unlock_irqrestore(&mns->keys_lock, flags); + KUNIT_ASSERT_TRUE(test, empty); + + rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + KUNIT_ASSERT_EQ(test, rc, 0); + + key = NULL; + single = false; + spin_lock_irqsave(&mns->keys_lock, flags); + if (!hlist_empty(&mns->keys)) { + key = hlist_entry(mns->keys.first, struct mctp_sk_key, hlist); + single = hlist_is_singular_node(&key->hlist, &mns->keys); + } + spin_unlock_irqrestore(&mns->keys_lock, flags); + + KUNIT_ASSERT_NOT_NULL(test, key); + KUNIT_ASSERT_TRUE(test, single); + + KUNIT_EXPECT_EQ(test, key->net, netid); + KUNIT_EXPECT_EQ(test, key->local_addr, src); + KUNIT_EXPECT_EQ(test, key->peer_addr, dst); + /* key has incoming tag, so inverse of what we sent */ + KUNIT_EXPECT_FALSE(test, key->tag & MCTP_TAG_OWNER); + + sock_release(sock); + mctp_test_route_destroy(test, rt); + mctp_test_destroy_dev(dev); +} + static struct kunit_case mctp_test_cases[] = { KUNIT_CASE_PARAM(mctp_test_fragment, mctp_frag_gen_params), KUNIT_CASE_PARAM(mctp_test_rx_input, mctp_rx_input_gen_params), @@ -673,6 +1053,11 @@ static struct kunit_case mctp_test_cases[] = { mctp_route_input_sk_reasm_gen_params), KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys, mctp_route_input_sk_keys_gen_params), + KUNIT_CASE(mctp_test_route_input_multiple_nets_bind), + KUNIT_CASE(mctp_test_route_input_multiple_nets_key), + KUNIT_CASE(mctp_test_packet_flow), + KUNIT_CASE(mctp_test_fragment_flow), + KUNIT_CASE(mctp_test_route_output_key_create), {} }; diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index e03ba66bbe..565763eb02 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -4,6 +4,7 @@ #include <linux/mctp.h> #include <linux/if_arp.h> +#include <net/mctp.h> #include <net/mctpdevice.h> #include <net/pkt_sched.h> @@ -54,6 +55,7 @@ struct mctp_test_dev *mctp_test_create_dev(void) rcu_read_lock(); dev->mdev = __mctp_dev_get(ndev); + dev->mdev->net = mctp_default_net(dev_net(ndev)); rcu_read_unlock(); return dev; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1af29af653..2dc7a908a6 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -594,7 +594,7 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, struct in_addr daddr; memcpy(&daddr, addr, sizeof(struct in_addr)); - rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); + rt = ip_route_output(net, daddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return ERR_CAST(rt); @@ -1154,7 +1154,7 @@ static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, if ((all || type == NETCONFA_INPUT) && nla_put_s32(skb, NETCONFA_INPUT, - mdev->input_enabled) < 0) + READ_ONCE(mdev->input_enabled)) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -1303,11 +1303,12 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct hlist_head *head; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct mpls_dev *mdev; - int idx, s_idx; - int h, s_h; + int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; @@ -1324,40 +1325,23 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, } } - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - rcu_read_lock(); - cb->seq = net->dev_base_seq; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - mdev = mpls_dev_get(dev); - if (!mdev) - goto cont; - if (mpls_netconf_fill_devconf(skb, mdev, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, - NLM_F_MULTI, - NETCONFA_ALL) < 0) { - rcu_read_unlock(); - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - rcu_read_unlock(); + rcu_read_lock(); + for_each_netdev_dump(net, dev, ctx->ifindex) { + mdev = mpls_dev_get(dev); + if (!mdev) + continue; + err = mpls_netconf_fill_devconf(skb, mdev, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) + break; } -done: - cb->args[0] = h; - cb->args[1] = idx; + rcu_read_unlock(); - return skb->len; + return err; } #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ @@ -1393,13 +1377,13 @@ static const struct ctl_table mpls_dev_table[] = { .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, - { } }; static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + size_t table_size = ARRAY_SIZE(mpls_dev_table); struct net *net = dev_net(dev); struct ctl_table *table; int i; @@ -1411,7 +1395,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) { + for (i = 0; i < table_size; i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; table[i].extra1 = mdev; table[i].extra2 = net; @@ -1419,8 +1403,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); - mdev->sysctl = register_net_sysctl_sz(net, path, table, - ARRAY_SIZE(mpls_dev_table)); + mdev->sysctl = register_net_sysctl_sz(net, path, table, table_size); if (!mdev->sysctl) goto free; @@ -1438,7 +1421,7 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct mpls_dev *mdev) { struct net *net = dev_net(dev); - struct ctl_table *table; + const struct ctl_table *table; if (!mdev->sysctl) return; @@ -2179,7 +2162,9 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; - struct fib_dump_filter filter = {}; + struct fib_dump_filter filter = { + .rtnl_held = true, + }; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; @@ -2667,11 +2652,11 @@ static const struct ctl_table mpls_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, - { } }; static int mpls_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; @@ -2687,11 +2672,11 @@ static int mpls_net_init(struct net *net) /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, - ARRAY_SIZE(mpls_table)); + table_size); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; @@ -2704,7 +2689,7 @@ static void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; - struct ctl_table *table; + const struct ctl_table *table; unsigned int index; table = net->mpls.ctl->ctl_table_arg; @@ -2771,7 +2756,8 @@ static int __init mpls_init(void) mpls_getroute, mpls_dump_routes, 0); rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, - mpls_netconf_dump_devconf, 0); + mpls_netconf_dump_devconf, + RTNL_FLAG_DUMP_UNLOCKED); err = ipgre_tunnel_encap_add_mpls_ops(); if (err) pr_err("Can't add mpls over gre tunnel ops\n"); diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 45d1e6a157..34ab659f54 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -109,5 +109,5 @@ module_init(mpls_gso_init); module_exit(mpls_gso_exit); MODULE_DESCRIPTION("MPLS GSO support"); -MODULE_AUTHOR("Simon Horman (horms@verge.net.au)"); +MODULE_AUTHOR("Simon Horman <horms@verge.net.au>"); MODULE_LICENSE("GPL"); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index ef59e25dc4..4385fd3b13 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -55,8 +55,6 @@ static int mpls_xmit(struct sk_buff *skb) out_dev = dst->dev; net = dev_net(out_dev); - skb_orphan(skb); - if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; @@ -83,7 +81,7 @@ static int mpls_xmit(struct sk_buff *skb) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; - rt = (struct rtable *)dst; + rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; @@ -92,7 +90,7 @@ static int mpls_xmit(struct sk_buff *skb) ttl = net->mpls.default_ttl; else ttl = ipv6_hdr(skb)->hop_limit; - rt6 = (struct rt6_info *)dst; + rt6 = dst_rt6_info(dst); } else { goto drop; } diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 13fe0748dd..98b1dd498f 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -92,10 +92,65 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; - strcpy(pernet->scheduler, "default"); + strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler)); } #ifdef CONFIG_SYSCTL +static int mptcp_set_scheduler(const struct net *net, const char *name) +{ + struct mptcp_pernet *pernet = mptcp_get_pernet(net); + struct mptcp_sched_ops *sched; + int ret = 0; + + rcu_read_lock(); + sched = mptcp_sched_find(name); + if (sched) + strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + +static int proc_scheduler(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + const struct net *net = current->nsproxy->net_ns; + char val[MPTCP_SCHED_NAME_MAX]; + struct ctl_table tbl = { + .data = val, + .maxlen = MPTCP_SCHED_NAME_MAX, + }; + int ret; + + strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) + ret = mptcp_set_scheduler(net, val); + + return ret; +} + +static int proc_available_schedulers(struct ctl_table *ctl, + int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + + return ret; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -148,7 +203,13 @@ static struct ctl_table mptcp_sysctl_table[] = { .procname = "scheduler", .maxlen = MPTCP_SCHED_NAME_MAX, .mode = 0644, - .proc_handler = proc_dostring, + .proc_handler = proc_scheduler, + }, + { + .procname = "available_schedulers", + .maxlen = MPTCP_SCHED_BUF_MAX, + .mode = 0644, + .proc_handler = proc_available_schedulers, }, { .procname = "close_timeout", @@ -156,7 +217,6 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - {} }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -178,7 +238,8 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; table[6].data = &pernet->scheduler; - table[7].data = &pernet->close_timeout; + /* table[7] is for available_schedulers which is read-only info */ + table[8].data = &pernet->close_timeout; hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); @@ -198,7 +259,7 @@ err_alloc: static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { - struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 7017dd6065..3ae46b545d 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -10,7 +10,6 @@ #include <linux/net.h> #include <linux/inet_diag.h> #include <net/netlink.h> -#include <uapi/linux/mptcp.h> #include "protocol.h" static int subflow_get_info(struct sock *sk, struct sk_buff *skb) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index c30405e768..7884217f33 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -19,7 +19,9 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index dd7fd1f246..66aa67f49d 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +#include <net/inet_common.h> + enum linux_mptcp_mib_field { MPTCP_MIB_NUM = 0, MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ @@ -12,7 +14,9 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNBACKUPRX, /* Received a SYN + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKBACKUPRX, /* Received a SYN/ACK + MP_JOIN + backup flag */ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 5409c2ea3f..0566dd7938 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -10,7 +10,6 @@ #include <linux/net.h> #include <linux/inet_diag.h> #include <net/netlink.h> -#include <uapi/linux/mptcp.h> #include "protocol.h" static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, @@ -225,6 +224,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static const struct inet_diag_handler mptcp_diag_handler = { + .owner = THIS_MODULE, .dump = mptcp_diag_dump, .dump_one = mptcp_diag_dump_one, .idiag_get_info = mptcp_diag_get_info, diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index 670da7822e..c30a2a90a1 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -32,8 +32,9 @@ const struct nla_policy mptcp_pm_del_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] }; /* MPTCP_PM_CMD_GET_ADDR - do */ -const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1] = { - [MPTCP_PM_ENDPOINT_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), +const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ATTR_TOKEN + 1] = { + [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_address_nl_policy), + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, }; /* MPTCP_PM_CMD_FLUSH_ADDRS - do */ @@ -110,7 +111,7 @@ const struct genl_ops mptcp_pm_nl_ops[11] = { .doit = mptcp_pm_nl_get_addr_doit, .dumpit = mptcp_pm_nl_get_addr_dumpit, .policy = mptcp_pm_get_addr_nl_policy, - .maxattr = MPTCP_PM_ENDPOINT_ADDR, + .maxattr = MPTCP_PM_ATTR_TOKEN, .flags = GENL_UNS_ADMIN_PERM, }, { diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h index ac9fc7225b..e24258f6f8 100644 --- a/net/mptcp/mptcp_pm_gen.h +++ b/net/mptcp/mptcp_pm_gen.h @@ -18,7 +18,7 @@ extern const struct nla_policy mptcp_pm_add_addr_nl_policy[MPTCP_PM_ENDPOINT_ADD extern const struct nla_policy mptcp_pm_del_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1]; -extern const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1]; +extern const struct nla_policy mptcp_pm_get_addr_nl_policy[MPTCP_PM_ATTR_TOKEN + 1]; extern const struct nla_policy mptcp_pm_flush_addrs_nl_policy[MPTCP_PM_ENDPOINT_ADDR + 1]; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 63fc0758c2..ac2f1a54cc 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -689,8 +689,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); - opts->ahmac = add_addr_generate_hmac(msk->local_key, - msk->remote_key, + opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), + READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); @@ -792,7 +792,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk, *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; - opts->rcvr_key = msk->remote_key; + opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); @@ -909,7 +909,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; - opts->backup = subflow_req->backup; + opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; @@ -958,7 +958,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || - ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) { + ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && + (!mp_opt->echo || subflow->mp_join)))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ @@ -1031,7 +1032,7 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; - msk->snd_una = new_snd_una; + WRITE_ONCE(msk->snd_una, new_snd_una); } static void ack_update_msk(struct mptcp_sock *msk, @@ -1058,21 +1059,22 @@ static void ack_update_msk(struct mptcp_sock *msk, new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) - msk->wnd_end = new_wnd_end; + WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ - if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) + if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } + msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, - new_wnd_end, msk->wnd_end); + new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -1100,8 +1102,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, if (mp_opt->echo) return true; - hmac = add_addr_generate_hmac(msk->remote_key, - msk->local_key, + hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), + READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", @@ -1148,7 +1150,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && - msk->local_key == mp_opt.rcvr_key) { + READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 4ae19113b8..23bb89c94e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -6,7 +6,6 @@ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> -#include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" @@ -77,7 +76,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int { struct mptcp_pm_data *pm = &msk->pm; - pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); + pr_debug("msk=%p, token=%u side=%d", msk, READ_ONCE(msk->token), server_side); WRITE_ONCE(pm->server_side, server_side); mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); @@ -427,6 +426,18 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, &skc_local); } +bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) +{ + struct mptcp_addr_info skc_local; + + mptcp_local_address((struct sock_common *)skc, &skc_local); + + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_is_backup(msk, &skc_local); + + return mptcp_pm_nl_is_backup(msk, &skc_local); +} + int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { @@ -441,13 +452,27 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex); } -int mptcp_pm_set_flags(struct net *net, struct nlattr *token, - struct mptcp_pm_addr_entry *loc, - struct mptcp_pm_addr_entry *rem, u8 bkup) +int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info) +{ + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_get_addr(skb, info); + return mptcp_pm_nl_get_addr(skb, info); +} + +int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_dump_addr(msg, cb); + return mptcp_pm_nl_dump_addr(msg, cb); +} + +int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info) { - if (token) - return mptcp_userspace_pm_set_flags(net, token, loc, rem, bkup); - return mptcp_pm_nl_set_flags(net, loc, bkup); + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_set_flags(skb, info); + return mptcp_pm_nl_set_flags(skb, info); } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 58d17d9604..4cae2aa7be 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -8,18 +8,13 @@ #include <linux/inet.h> #include <linux/kernel.h> -#include <net/tcp.h> #include <net/inet_common.h> #include <net/netns/generic.h> #include <net/mptcp.h> -#include <net/genetlink.h> -#include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h" - -/* forward declaration */ -static struct genl_family mptcp_genl_family; +#include "mptcp_pm_gen.h" static int pm_nl_pernet_id; @@ -353,7 +348,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { - if (mptcp_pm_is_kernel(msk)) + if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; sk_reset_timer(sk, &add_entry->add_timer, @@ -476,7 +471,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; - subflow->backup = backup; subflow->request_bkup = backup; } @@ -505,15 +499,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) } static struct mptcp_pm_addr_entry * -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, - bool lookup_by_id) +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &pernet->local_addr_list, list) { - if ((!lookup_by_id && - mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) || - (lookup_by_id && entry->addr.id == info->id)) + if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } return NULL; @@ -521,8 +512,8 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { + struct mptcp_pm_addr_entry *local, *signal_and_subflow = NULL; struct sock *sk = (struct sock *)msk; - struct mptcp_pm_addr_entry *local; unsigned int add_addr_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; @@ -543,7 +534,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr, false); + entry = __lookup_addr(pernet, &mpc_addr); if (entry) { __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); msk->mpc_endpoint_id = entry->addr.id; @@ -564,8 +555,6 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { - local = select_signal_address(pernet, msk); - /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -576,16 +565,26 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; - if (local) { - if (mptcp_pm_alloc_anno_list(msk, &local->addr)) { - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); - mptcp_pm_nl_addr_send_ack(msk); - } - } + local = select_signal_address(pernet, msk); + if (!local) + goto subflow; + + /* If the alloc fails, we are on memory pressure, not worth + * continuing, and trying to create subflows. + */ + if (!mptcp_pm_alloc_anno_list(msk, &local->addr)) + return; + + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled++; + mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_addr_send_ack(msk); + + if (local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = local; } +subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && msk->pm.subflows < subflows_max) { @@ -593,9 +592,14 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) bool fullmesh; int i, nr; - local = select_local_address(pernet, msk); - if (!local) - break; + if (signal_and_subflow) { + local = signal_and_subflow; + signal_and_subflow = NULL; + } else { + local = select_local_address(pernet, msk); + if (!local) + break; + } fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); @@ -685,6 +689,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; + bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); @@ -712,15 +717,18 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (nr == 0) return; - msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &addrs[i], &remote); + if (__mptcp_subflow_connect(sk, &addrs[i], &remote) == 0) + sf_created = true; spin_lock_bh(&msk->pm.lock); + + if (sf_created) { + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + } } void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) @@ -822,10 +830,13 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, spin_lock_bh(&msk->pm.lock); removed = true; - __MPTCP_INC_STATS(sock_net(sk), rm_type); + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __MPTCP_INC_STATS(sock_net(sk), rm_type); } if (rm_type == MPTCP_MIB_RMSUBFLOW) __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap); + else if (rm_type == MPTCP_MIB_RMADDR) + __MPTCP_INC_STATS(sock_net(sk), rm_type); if (!removed) continue; @@ -1103,6 +1114,24 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc return ret; } +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + struct mptcp_pm_addr_entry *entry; + bool backup = false; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + break; + } + } + rcu_read_unlock(); + + return backup; +} + #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 @@ -1312,8 +1341,8 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; - if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - GENL_SET_ERR_MSG(info, "flags must have signal when using port"); + if (addr.addr.port && !address_use_port(&addr)) { + GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); return -EINVAL; } @@ -1402,6 +1431,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= ret; mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } @@ -1535,39 +1565,49 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; + int anno_nr = 0; list_for_each_entry(entry, rm_list, list) { - if ((remove_anno_list_by_saddr(msk, &entry->addr) || - lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) && - alist.nr < MPTCP_RM_IDS_MAX) - alist.ids[alist.nr++] = entry->addr.id; + if (alist.nr >= MPTCP_RM_IDS_MAX) + break; + + /* only delete if either announced or matching a subflow */ + if (remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!lookup_subflow_by_saddr(&msk->conn_list, + &entry->addr)) + continue; + + alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } } -void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) +static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { - if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - slist.nr < MPTCP_RM_IDS_MAX) + if (slist.nr < MPTCP_RM_IDS_MAX && + lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) slist.ids[slist.nr++] = entry->addr.id; - if (remove_anno_list_by_saddr(msk, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX) + if (alist.nr < MPTCP_RM_IDS_MAX && + remove_anno_list_by_saddr(msk, &entry->addr)) alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= alist.nr; mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } @@ -1636,8 +1676,8 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) return 0; } -static int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry) +int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; @@ -1675,7 +1715,7 @@ nla_put_failure: return -EMSGSIZE; } -int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); @@ -1725,8 +1765,13 @@ fail: return ret; } -int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + return mptcp_pm_get_addr(skb, info); +} + +int mptcp_pm_nl_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb) { struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; @@ -1768,6 +1813,12 @@ int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, return msg->len; } +int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return mptcp_pm_dump_addr(msg, cb); +} + static int parse_limit(struct genl_info *info, int id, unsigned int *limit) { struct nlattr *attr = info->attrs[id]; @@ -1882,66 +1933,63 @@ next: return ret; } -int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup) +int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) { - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | MPTCP_PM_ADDR_FLAG_FULLMESH; + struct net *net = sock_net(skb->sk); struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; u8 lookup_by_id = 0; + u8 bkup = 0; + int ret; + + pernet = pm_nl_get_pernet(net); + + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + return ret; - if (addr->addr.family == AF_UNSPEC) { + if (addr.addr.family == AF_UNSPEC) { lookup_by_id = 1; - if (!addr->addr.id) + if (!addr.addr.id) { + GENL_SET_ERR_MSG(info, "missing required inputs"); return -EOPNOTSUPP; + } } + if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + bkup = 1; + spin_lock_bh(&pernet->lock); - entry = __lookup_addr(pernet, &addr->addr, lookup_by_id); + entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : + __lookup_addr(pernet, &addr.addr); if (!entry) { spin_unlock_bh(&pernet->lock); + GENL_SET_ERR_MSG(info, "address not found"); return -EINVAL; } - if ((addr->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { spin_unlock_bh(&pernet->lock); + GENL_SET_ERR_MSG(info, "invalid addr flags"); return -EINVAL; } - changed = (addr->flags ^ entry->flags) & mask; - entry->flags = (entry->flags & ~mask) | (addr->flags & mask); - *addr = *entry; + changed = (addr.flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (addr.flags & mask); + addr = *entry; spin_unlock_bh(&pernet->lock); - mptcp_nl_set_flags(net, &addr->addr, bkup, changed); + mptcp_nl_set_flags(net, &addr.addr, bkup, changed); return 0; } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { - struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, }; - struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct net *net = sock_net(skb->sk); - u8 bkup = 0; - int ret; - - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - if (attr_rem) { - ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote); - if (ret < 0) - return ret; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) - bkup = 1; - - return mptcp_pm_set_flags(net, token, &addr, &remote, bkup); + return mptcp_pm_set_flags(skb, info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) @@ -2014,7 +2062,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_subflow_context *sf; u8 sk_err; - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) @@ -2072,7 +2120,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { - int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token); + int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); if (err) return err; @@ -2100,7 +2148,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) if (!nlh) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) @@ -2135,7 +2183,7 @@ void mptcp_event_addr_announced(const struct sock *ssk, if (!nlh) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) @@ -2251,7 +2299,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: @@ -2281,7 +2329,7 @@ nla_put_failure: nlmsg_free(skb); } -static struct genl_family mptcp_genl_family __ro_after_init = { +struct genl_family mptcp_genl_family __ro_after_init = { .name = MPTCP_PM_NAME, .version = MPTCP_PM_VER, .netnsok = true, diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index bc97cc30f0..8eaa9fbe3e 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -6,6 +6,7 @@ #include "protocol.h" #include "mib.h" +#include "mptcp_pm_gen.h" void mptcp_free_local_addr_list(struct mptcp_sock *msk) { @@ -106,19 +107,26 @@ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, return -EINVAL; } +static struct mptcp_pm_addr_entry * +mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { - struct mptcp_pm_addr_entry *entry, *match = NULL; + struct mptcp_pm_addr_entry *match; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (id == entry->addr.id) { - match = entry; - break; - } - } + match = mptcp_userspace_pm_lookup_addr_by_id(msk, id); spin_unlock_bh(&msk->pm.lock); if (match) { *flags = match->flags; @@ -157,6 +165,24 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); } +bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, + struct mptcp_addr_info *skc) +{ + struct mptcp_pm_addr_entry *entry; + bool backup = false; + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, skc, false)) { + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + break; + } + } + spin_unlock_bh(&msk->pm.lock); + + return backup; +} + int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; @@ -261,7 +287,7 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; - struct mptcp_pm_addr_entry *match = NULL; + struct mptcp_pm_addr_entry *match; struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; LIST_HEAD(free_list); @@ -298,13 +324,7 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) lock_sock(sk); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (entry->addr.id == id_val) { - match = entry; - break; - } - } - + match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); release_sock(sk); @@ -334,7 +354,6 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry local = { 0 }; struct mptcp_addr_info addr_r; - struct mptcp_addr_info addr_l; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; @@ -360,25 +379,31 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) goto create_err; } - err = mptcp_pm_parse_addr(laddr, info, &addr_l); + err = mptcp_pm_parse_entry(laddr, info, true, &local); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto create_err; } + if (local.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + GENL_SET_ERR_MSG(info, "invalid addr flags"); + err = -EINVAL; + goto create_err; + } + local.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto create_err; } - if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) { + if (!mptcp_pm_addr_families_match(sk, &local.addr, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); err = -EINVAL; goto create_err; } - local.addr = addr_l; err = mptcp_userspace_pm_append_new_local_addr(msk, &local, false); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); @@ -387,7 +412,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) lock_sock(sk); - err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); + err = __mptcp_subflow_connect(sk, &local.addr, &addr_r); release_sock(sk); @@ -540,35 +565,194 @@ destroy_err: return err; } -int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, - struct mptcp_pm_addr_entry *loc, - struct mptcp_pm_addr_entry *rem, u8 bkup) +int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) { + struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; + struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct net *net = sock_net(skb->sk); struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; u32 token_val; + u8 bkup = 0; token_val = nla_get_u32(token); msk = mptcp_token_get_sock(net, token_val); - if (!msk) + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return ret; + } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "userspace PM not selected"); + goto set_flags_err; + } + + ret = mptcp_pm_parse_entry(attr, info, false, &loc); + if (ret < 0) goto set_flags_err; - if (loc->addr.family == AF_UNSPEC || - rem->addr.family == AF_UNSPEC) + if (attr_rem) { + ret = mptcp_pm_parse_entry(attr_rem, info, false, &rem); + if (ret < 0) + goto set_flags_err; + } + + if (loc.addr.family == AF_UNSPEC || + rem.addr.family == AF_UNSPEC) { + GENL_SET_ERR_MSG(info, "invalid address families"); + ret = -EINVAL; goto set_flags_err; + } + + if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + bkup = 1; lock_sock(sk); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); release_sock(sk); set_flags_err: sock_put(sk); return ret; } + +int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct id_bitmap { + DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1); + } *bitmap; + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct mptcp_sock *msk; + struct nlattr *token; + int ret = -EINVAL; + struct sock *sk; + void *hdr; + + bitmap = (struct id_bitmap *)cb->ctx; + token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + + msk = mptcp_token_get_sock(net, nla_get_u32(token)); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return ret; + } + + sk = (struct sock *)msk; + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto out; + } + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (test_bit(entry->addr.id, bitmap->map)) + continue; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + break; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + break; + } + + __set_bit(entry->addr.id, bitmap->map); + genlmsg_end(msg, hdr); + } + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + ret = msg->len; + +out: + sock_put(sk); + return ret; +} + +int mptcp_userspace_pm_get_addr(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct mptcp_pm_addr_entry addr, *entry; + struct net *net = sock_net(skb->sk); + struct mptcp_sock *msk; + struct sk_buff *msg; + int ret = -EINVAL; + struct sock *sk; + void *reply; + + msk = mptcp_token_get_sock(net, nla_get_u32(token)); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return ret; + } + + sk = (struct sock *)msk; + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto out; + } + + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + goto out; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + info->genlhdr->cmd); + if (!reply) { + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + ret = -EMSGSIZE; + goto fail; + } + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + entry = mptcp_userspace_pm_lookup_addr_by_id(msk, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + ret = -EINVAL; + goto unlock_fail; + } + + ret = mptcp_nl_fill_addr(msg, entry); + if (ret) + goto unlock_fail; + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + sock_put(sk); + return ret; + +unlock_fail: + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); +fail: + nlmsg_free(msg); +out: + sock_put(sk); + return ret; +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2b921af271..ff8292d0cf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -15,12 +15,12 @@ #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> -#include <net/tcp.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> #endif #include <net/mptcp.h> +#include <net/hotdata.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include "protocol.h" @@ -350,8 +350,10 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; + } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; @@ -410,6 +412,7 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +/* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -441,16 +444,17 @@ static void mptcp_check_data_fin_ack(struct sock *sk) } } +/* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && - ((1 << sk->sk_state) & + ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); - if (msk->ack_seq == rcv_data_fin_seq) { + if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; @@ -705,6 +709,8 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } } while (more_data_avail); + if (moved > 0) + msk->last_data_recv = tcp_jiffies32; *bytes += moved; return done; } @@ -748,7 +754,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; - msk->ack_seq = end_seq; + WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; @@ -840,10 +846,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) sk_rbuf = ssk_rbuf; /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ - if (__mptcp_rmem(sk) > sk_rbuf) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); + if (__mptcp_rmem(sk) > sk_rbuf) return; - } /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); @@ -985,6 +989,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } +/* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1033,13 +1038,15 @@ static void __mptcp_clean_una(struct sock *sk) msk->recovery = false; out: - if (snd_una == READ_ONCE(msk->snd_nxt) && - snd_una == READ_ONCE(msk->write_seq)) { + if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } + + if (mptcp_pending_data_fin_ack(sk)) + mptcp_schedule_work(sk); } static void __mptcp_clean_una_wakeup(struct sock *sk) @@ -1266,7 +1273,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { + if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } @@ -1415,13 +1422,15 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } mptcp_for_each_subflow(msk, subflow) { + bool backup = subflow->backup || subflow->request_bkup; + trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); - nr_active += !subflow->backup; + nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ @@ -1432,9 +1441,9 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); - if (linger_time < send_info[subflow->backup].linger_time) { - send_info[subflow->backup].ssk = ssk; - send_info[subflow->backup].linger_time = linger_time; + if (linger_time < send_info[backup].linger_time) { + send_info[backup].ssk = ssk; + send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); @@ -1500,7 +1509,7 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; - msk->snd_nxt = snd_nxt_new; + WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } @@ -1552,6 +1561,8 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk, err = copied; out: + if (err > 0) + msk->last_data_sent = tcp_jiffies32; return err; } @@ -1687,15 +1698,6 @@ out: } } -static void mptcp_set_nospace(struct sock *sk) -{ - /* enable autotune */ - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - - /* will be cleared on avail space */ - set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); -} - static int mptcp_disconnect(struct sock *sk, int flags); static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, @@ -1766,6 +1768,30 @@ static int do_copy_data_nocache(struct sock *sk, int copy, return 0; } +/* open-code sk_stream_memory_free() plus sent limit computation to + * avoid indirect calls in fast-path. + * Called under the msk socket lock, so we can avoid a bunch of ONCE + * annotations. + */ +static u32 mptcp_send_limit(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + u32 limit, not_sent; + + if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) + return 0; + + limit = mptcp_notsent_lowat(sk); + if (limit == UINT_MAX) + return UINT_MAX; + + not_sent = msk->write_seq - msk->snd_nxt; + if (not_sent >= limit) + return 0; + + return limit - not_sent; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1810,6 +1836,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct mptcp_data_frag *dfrag; bool dfrag_collapsed; size_t psize, offset; + u32 copy_limit; + + /* ensure fitting the notsent_lowat() constraint */ + copy_limit = mptcp_send_limit(sk); + if (!copy_limit) + goto wait_for_memory; /* reuse tail pfrag, if possible, or carve a new one from the * page allocator @@ -1817,9 +1849,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { - if (!sk_stream_memory_free(sk)) - goto wait_for_memory; - if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; @@ -1834,6 +1863,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); + psize = min_t(size_t, psize, copy_limit); total_ts = psize + frag_truesize; if (!sk_wmem_schedule(sk, total_ts)) @@ -1869,7 +1899,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) continue; wait_for_memory: - mptcp_set_nospace(sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) @@ -2033,7 +2063,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); - tcp_sk(ssk)->window_clamp = window_clamp; + WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } @@ -2115,7 +2145,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) skb = skb_peek(&msk->receive_queue); if (skb) { - u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; @@ -2542,7 +2572,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { - tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); @@ -2759,7 +2789,7 @@ static void __mptcp_init_sock(struct sock *sk) __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - msk->rmem_fwd_alloc = 0; + WRITE_ONCE(msk->rmem_fwd_alloc, 0); WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; @@ -2770,6 +2800,9 @@ static void __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; msk->subflow_id = 1; + msk->last_data_sent = tcp_jiffies32; + msk->last_data_recv = tcp_jiffies32; + msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); @@ -2783,7 +2816,8 @@ static void mptcp_ca_reset(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_assign_congestion_control(sk); - strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); + strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name, + sizeof(mptcp_sk(sk)->ca_name)); /* no need to keep a reference to the ops, the name will suffice */ tcp_cleanup_congestion_control(sk); @@ -2884,9 +2918,14 @@ void mptcp_set_state(struct sock *sk, int state) if (oldstate != TCP_ESTABLISHED) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; - + case TCP_CLOSE_WAIT: + /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state: + * MPTCP "accepted" sockets will be created later on. So no + * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. + */ + break; default: - if (oldstate == TCP_ESTABLISHED) + if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } @@ -2975,7 +3014,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(msk->rmem_fwd_alloc); + WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -3150,16 +3189,16 @@ static int mptcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->recovery = false; - msk->can_ack = false; - msk->fully_established = false; - msk->rcv_data_fin = false; - msk->snd_data_fin_enable = false; - msk->rcv_fastclose = false; - msk->use_64bit_ack = false; - msk->bytes_consumed = 0; + WRITE_ONCE(msk->can_ack, false); + WRITE_ONCE(msk->fully_established, false); + WRITE_ONCE(msk->rcv_data_fin, false); + WRITE_ONCE(msk->snd_data_fin_enable, false); + WRITE_ONCE(msk->rcv_fastclose, false); + WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); + msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; @@ -3250,17 +3289,17 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, mptcp_copy_ip_options(nsk, sk); msk = mptcp_sk(nsk); - msk->local_key = subflow_req->local_key; - msk->token = subflow_req->token; + WRITE_ONCE(msk->local_key, subflow_req->local_key); + WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); - msk->write_seq = subflow_req->idsn + 1; - msk->snd_nxt = msk->write_seq; - msk->snd_una = msk->write_seq; - msk->wnd_end = msk->snd_nxt + tcp_sk(ssk)->snd_wnd; + WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); + WRITE_ONCE(msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); @@ -3363,9 +3402,6 @@ void __mptcp_data_acked(struct sock *sk) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); - - if (mptcp_pending_data_fin_ack(sk)) - mptcp_schedule_work(sk); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) @@ -3703,6 +3739,10 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } + + WRITE_ONCE(msk->write_seq, subflow->idsn); + WRITE_ONCE(msk->snd_nxt, subflow->idsn); + WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); @@ -3767,6 +3807,7 @@ static struct proto mptcp_prot = { .unhash = mptcp_unhash, .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, + .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, @@ -3849,11 +3890,10 @@ unlock: } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; - int err; pr_debug("msk=%p", msk); @@ -3865,9 +3905,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, return -EINVAL; pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); - newsk = inet_csk_accept(ssk, flags, &err, kern); + newsk = inet_csk_accept(ssk, arg); if (!newsk) - return err; + return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { @@ -3888,7 +3928,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - newsk->sk_kern_sock = kern; + newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); @@ -3917,7 +3957,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, } } else { tcpfallback: - newsk->sk_kern_sock = kern; + newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable @@ -3938,12 +3978,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; - if (sk_stream_is_writeable(sk)) + if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; - mptcp_set_nospace(sk); - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - if (sk_stream_is_writeable(sk)) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ + if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; return 0; @@ -4137,7 +4177,7 @@ int __init mptcp_proto_v6_init(void) int err; mptcp_v6_prot = mptcp_prot; - strcpy(mptcp_v6_prot.name, "MPTCPv6"); + strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 07f6242afc..8357046732 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -12,8 +12,7 @@ #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> #include <net/genetlink.h> - -#include "mptcp_pm_gen.h" +#include <net/rstreason.h> #define MPTCP_SUPPORTED_VERSION 1 @@ -113,10 +112,9 @@ #define MPTCP_RST_TRANSIENT BIT(0) /* MPTCP socket atomic flags */ -#define MPTCP_NOSPACE 1 -#define MPTCP_WORK_RTX 2 -#define MPTCP_FALLBACK_DONE 4 -#define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_WORK_RTX 1 +#define MPTCP_FALLBACK_DONE 2 +#define MPTCP_WORK_CLOSE_SUBFLOW 3 /* MPTCP socket release cb flags */ #define MPTCP_PUSH_PENDING 1 @@ -260,8 +258,10 @@ struct mptcp_data_frag { struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; - u64 local_key; - u64 remote_key; + u64 local_key; /* protected by the first subflow socket lock + * lockless access read + */ + u64 remote_key; /* same as above */ u64 write_seq; u64 bytes_sent; u64 snd_nxt; @@ -281,6 +281,9 @@ struct mptcp_sock { u64 bytes_acked; u64 snd_una; u64 wnd_end; + u32 last_data_sent; + u32 last_data_recv; + u32 last_ack_recv; unsigned long timer_ival; u32 token; int rmem_released; @@ -306,6 +309,10 @@ struct mptcp_sock { in_accept_queue:1, free_first:1, rcvspace_init:1; + u32 notsent_lowat; + int keepalive_cnt; + int keepalive_idle; + int keepalive_intvl; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -341,12 +348,30 @@ struct mptcp_sock { #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) +extern struct genl_family mptcp_genl_family; + static inline void msk_owned_by_me(const struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); } +#ifdef CONFIG_DEBUG_NET +/* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */ +#undef tcp_sk +#define tcp_sk(ptr) ({ \ + typeof(ptr) _ptr = (ptr); \ + WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \ + container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \ +}) +#define mptcp_sk(ptr) ({ \ + typeof(ptr) _ptr = (ptr); \ + WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \ + container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \ +}) + +#else /* !CONFIG_DEBUG_NET */ #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) +#endif /* the msk socket don't use the backlog, also account for the bulk * free memory @@ -400,7 +425,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (msk->snd_una == READ_ONCE(msk->snd_nxt)) + if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); @@ -418,6 +443,7 @@ struct mptcp_subflow_request_sock { u16 mp_capable : 1, mp_join : 1, backup : 1, + request_bkup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; @@ -538,7 +564,7 @@ struct mptcp_subflow_context { static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; @@ -558,6 +584,43 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) WRITE_ONCE(subflow->local_id, -1); } +/* Convert reset reasons in MPTCP to enum sk_rst_reason type */ +static inline enum sk_rst_reason +sk_rst_convert_mptcp_reason(u32 reason) +{ + switch (reason) { + case MPTCP_RST_EUNSPEC: + return SK_RST_REASON_MPTCP_RST_EUNSPEC; + case MPTCP_RST_EMPTCP: + return SK_RST_REASON_MPTCP_RST_EMPTCP; + case MPTCP_RST_ERESOURCE: + return SK_RST_REASON_MPTCP_RST_ERESOURCE; + case MPTCP_RST_EPROHIBIT: + return SK_RST_REASON_MPTCP_RST_EPROHIBIT; + case MPTCP_RST_EWQ2BIG: + return SK_RST_REASON_MPTCP_RST_EWQ2BIG; + case MPTCP_RST_EBADPERF: + return SK_RST_REASON_MPTCP_RST_EBADPERF; + case MPTCP_RST_EMIDDLEBOX: + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; + default: + /* It should not happen, or else errors may occur + * in MPTCP layer + */ + return SK_RST_REASON_ERROR; + } +} + +static inline void +mptcp_send_active_reset_reason(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + enum sk_rst_reason reason; + + reason = sk_rst_convert_mptcp_reason(subflow->reset_reason); + tcp_send_active_reset(sk, GFP_ATOMIC, reason); +} + static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { @@ -622,6 +685,7 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); +void mptcp_get_available_schedulers(char *buf, size_t maxlen); void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); @@ -790,14 +854,36 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); } +static inline u32 mptcp_notsent_lowat(const struct sock *sk) +{ + struct net *net = sock_net(sk); + u32 val; + + val = READ_ONCE(mptcp_sk(sk)->notsent_lowat); + return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); +} + +static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + u32 notsent_bytes; + + notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt); + return (notsent_bytes << wake) < mptcp_notsent_lowat(sk); +} + +static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake) +{ + return mptcp_stream_memory_free(sk, wake) && + __sk_stream_is_writeable(sk, wake); +} + static inline void mptcp_write_space(struct sock *sk) { - if (sk_stream_is_writeable(sk)) { - /* pairs with memory barrier in mptcp_poll */ - smp_mb(); - if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags)) - sk_stream_write_space(sk); - } + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (mptcp_stream_memory_free(sk, 1)) + sk_stream_write_space(sk); } static inline void __mptcp_sync_sndbuf(struct sock *sk) @@ -808,7 +894,7 @@ static inline void __mptcp_sync_sndbuf(struct sock *sk) if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; - new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0]; + new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]); mptcp_for_each_subflow(mptcp_sk(sk), subflow) { ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf); @@ -928,21 +1014,15 @@ int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex); -int mptcp_pm_set_flags(struct net *net, struct nlattr *token, - struct mptcp_pm_addr_entry *loc, - struct mptcp_pm_addr_entry *rem, u8 bkup); -int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup); -int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, - struct mptcp_pm_addr_entry *loc, - struct mptcp_pm_addr_entry *rem, u8 bkup); +int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); +int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); +int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); -void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list); void mptcp_free_local_addr_list(struct mptcp_sock *msk); @@ -958,6 +1038,8 @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); +int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { @@ -1022,6 +1104,18 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); +int mptcp_pm_nl_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb); +int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb); +int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info); +int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info); +int mptcp_userspace_pm_get_addr(struct sk_buff *skb, + struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 4ab0693c06..4a7fd0508a 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -51,6 +51,28 @@ struct mptcp_sched_ops *mptcp_sched_find(const char *name) return ret; } +/* Build string with list of available scheduler values. + * Similar to tcp_get_available_congestion_control() + */ +void mptcp_get_available_schedulers(char *buf, size_t maxlen) +{ + struct mptcp_sched_ops *sched; + size_t offs = 0; + + rcu_read_lock(); + spin_lock(&mptcp_sched_list_lock); + list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", sched->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; + } + spin_unlock(&mptcp_sched_list_lock); + rcu_read_unlock(); +} + int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { if (!sched->get_subflow) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index ef3edba754..f9a4fb17b5 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -181,8 +181,6 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, switch (optname) { case SO_KEEPALIVE: - mptcp_sol_socket_sync_intval(msk, optname, val); - return 0; case SO_DEBUG: case SO_MARK: case SO_PRIORITY: @@ -618,26 +616,42 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t } if (ret == 0) - strcpy(msk->ca_name, name); + strscpy(msk->ca_name, name, sizeof(msk->ca_name)); release_sock(sk); return ret; } -static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval, - unsigned int optlen) +static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max, + int (*set_val)(struct sock *, int), + int *msk_val, int val) { struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int val; + int err = 0; - if (optlen < sizeof(int)) - return -EINVAL; + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int ret; - if (copy_from_sockptr(&val, optval, sizeof(val))) - return -EFAULT; + lock_sock(ssk); + ret = set_val(ssk, val); + err = err ? : ret; + release_sock(ssk); + } + + if (!err) { + *msk_val = val; + sockopt_seq_inc(msk); + } + + return err; +} + +static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; - lock_sock(sk); sockopt_seq_inc(msk); msk->cork = !!val; mptcp_for_each_subflow(msk, subflow) { @@ -649,25 +663,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optva } if (!val) mptcp_check_and_set_pending(sk); - release_sock(sk); return 0; } -static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval, - unsigned int optlen) +static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; - int val; - - if (optlen < sizeof(int)) - return -EINVAL; - - if (copy_from_sockptr(&val, optval, sizeof(val))) - return -EFAULT; - lock_sock(sk); sockopt_seq_inc(msk); msk->nodelay = !!val; mptcp_for_each_subflow(msk, subflow) { @@ -679,8 +683,6 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op } if (val) mptcp_check_and_set_pending(sk); - release_sock(sk); - return 0; } @@ -803,25 +805,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, int ret, val; switch (optname) { - case TCP_INQ: - ret = mptcp_get_int_option(msk, optval, optlen, &val); - if (ret) - return ret; - if (val < 0 || val > 1) - return -EINVAL; - - lock_sock(sk); - msk->recvmsg_inq = !!val; - release_sock(sk); - return 0; case TCP_ULP: return -EOPNOTSUPP; case TCP_CONGESTION: return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); - case TCP_CORK: - return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen); - case TCP_NODELAY: - return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); case TCP_DEFER_ACCEPT: /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); @@ -834,7 +821,50 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, optval, optlen); } - return -EOPNOTSUPP; + ret = mptcp_get_int_option(msk, optval, optlen, &val); + if (ret) + return ret; + + lock_sock(sk); + switch (optname) { + case TCP_INQ: + if (val < 0 || val > 1) + ret = -EINVAL; + else + msk->recvmsg_inq = !!val; + break; + case TCP_NOTSENT_LOWAT: + WRITE_ONCE(msk->notsent_lowat, val); + mptcp_write_space(sk); + break; + case TCP_CORK: + ret = __mptcp_setsockopt_sol_tcp_cork(msk, val); + break; + case TCP_NODELAY: + ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val); + break; + case TCP_KEEPIDLE: + ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE, + &tcp_sock_set_keepidle_locked, + &msk->keepalive_idle, val); + break; + case TCP_KEEPINTVL: + ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL, + &tcp_sock_set_keepintvl, + &msk->keepalive_intvl, val); + break; + case TCP_KEEPCNT: + ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT, + &tcp_sock_set_keepcnt, + &msk->keepalive_cnt, + val); + break; + default: + ret = -ENOPROTOOPT; + } + + release_sock(sk); + return ret; } int mptcp_setsockopt(struct sock *sk, int level, int optname, @@ -907,6 +937,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) struct sock *sk = (struct sock *)msk; u32 flags = 0; bool slow; + u32 now; memset(info, 0, sizeof(*info)); @@ -935,14 +966,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; - mptcp_data_lock(sk); - info->mptcpi_snd_una = msk->snd_una; - info->mptcpi_rcv_nxt = msk->ack_seq; - info->mptcpi_bytes_acked = msk->bytes_acked; - mptcp_data_unlock(sk); slow = lock_sock_fast(sk); - info->mptcpi_csum_enabled = msk->csum_enabled; + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); info->mptcpi_token = msk->token; info->mptcpi_write_seq = msk->write_seq; info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; @@ -951,7 +977,17 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_retrans = msk->bytes_retrans; info->mptcpi_subflows_total = info->mptcpi_subflows + __mptcp_has_initial_subflow(msk); + now = tcp_jiffies32; + info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); + info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv); unlock_sock_fast(sk, slow); + + mptcp_data_lock(sk); + info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv); + info->mptcpi_snd_una = msk->snd_una; + info->mptcpi_rcv_nxt = msk->ack_seq; + info->mptcpi_bytes_acked = msk->bytes_acked; + mptcp_data_unlock(sk); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); @@ -963,6 +999,10 @@ static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, in if (get_user(len, optlen)) return -EFAULT; + /* When used only to check if a fallback to TCP happened. */ + if (len == 0) + return 0; + len = min_t(unsigned int, len, sizeof(struct mptcp_info)); mptcp_diag_fill_info(msk, &m_info); @@ -1331,6 +1371,8 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { + struct sock *sk = (void *)msk; + switch (optname) { case TCP_ULP: case TCP_CONGESTION: @@ -1349,6 +1391,22 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return mptcp_put_int_option(msk, optval, optlen, msk->cork); case TCP_NODELAY: return mptcp_put_int_option(msk, optval, optlen, msk->nodelay); + case TCP_KEEPIDLE: + return mptcp_put_int_option(msk, optval, optlen, + msk->keepalive_idle ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ); + case TCP_KEEPINTVL: + return mptcp_put_int_option(msk, optval, optlen, + msk->keepalive_intvl ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ); + case TCP_KEEPCNT: + return mptcp_put_int_option(msk, optval, optlen, + msk->keepalive_cnt ? : + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes)); + case TCP_NOTSENT_LOWAT: + return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat); + case TCP_IS_MPTCP: + return mptcp_put_int_option(msk, optval, optlen, 1); } return -EOPNOTSUPP; } @@ -1464,6 +1522,9 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) tcp_set_congestion_control(ssk, msk->ca_name, false, true); __tcp_sock_set_cork(ssk, !!msk->cork); __tcp_sock_set_nodelay(ssk, !!msk->nodelay); + tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle); + tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl); + tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt); inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); @@ -1530,7 +1591,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, space); - tcp_sk(ssk)->window_clamp = val; + WRITE_ONCE(tcp_sk(ssk)->window_clamp, val); unlock_sock_fast(ssk, slow); } return 0; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 13f66d11b7..c330946384 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -15,13 +15,12 @@ #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> -#include <net/tcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> #include <net/transp_v6.h> #endif #include <net/mptcp.h> -#include <uapi/linux/mptcp.h> + #include "protocol.h" #include "mib.h" @@ -75,7 +74,8 @@ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_ get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); - subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_generate_hmac(READ_ONCE(msk->local_key), + READ_ONCE(msk->remote_key), subflow_req->local_nonce, subflow_req->remote_nonce, hmac); @@ -100,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) return NULL; } subflow_req->local_id = local_id; + subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } @@ -151,8 +152,10 @@ static int subflow_check_req(struct request_sock *req, /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ - if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EINVAL; + } #endif mptcp_get_options(skb, &mp_opt); @@ -166,6 +169,9 @@ static int subflow_check_req(struct request_sock *req, return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + + if (mp_opt.backup) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); } if (opt_mp_capable && listener->request_mptcp) { @@ -220,6 +226,7 @@ again: ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); @@ -228,10 +235,12 @@ again: subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { - if (mptcp_can_accept_new_subflow(subflow_req->msk)) - subflow_init_req_cookie_join_save(subflow_req, skb); - else + if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; + } + + subflow_init_req_cookie_join_save(subflow_req, skb); } pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, @@ -282,10 +291,21 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); +static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb) +{ + const struct mptcp_ext *mpext = mptcp_get_ext(skb); + + if (!mpext) + return SK_RST_REASON_NOT_SPECIFIED; + + return sk_rst_convert_mptcp_reason(mpext->reset_reason); +} + static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -293,7 +313,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; @@ -303,7 +323,8 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, dst_release(dst); if (!req->syncookie) - tcp_request_sock_ops.send_reset(sk, skb); + tcp_request_sock_ops.send_reset(sk, skb, + mptcp_get_rst_reason(skb)); return NULL; } @@ -352,7 +373,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -360,7 +382,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; @@ -370,7 +392,8 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, dst_release(dst); if (!req->syncookie) - tcp6_request_sock_ops.send_reset(sk, skb); + tcp6_request_sock_ops.send_reset(sk, skb, + mptcp_get_rst_reason(skb)); return NULL; } #endif @@ -406,7 +429,7 @@ void mptcp_subflow_reset(struct sock *ssk) /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); - tcp_send_active_reset(ssk, GFP_ATOMIC); + mptcp_send_active_reset_reason(ssk); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); @@ -558,6 +581,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + if (subflow->backup) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); + if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), @@ -595,6 +621,8 @@ static int subflow_chk_local_id(struct sock *sk) return err; subflow_set_local_id(subflow, err); + subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); + return 0; } @@ -714,7 +742,8 @@ static bool subflow_hmac_valid(const struct request_sock *req, if (!msk) return false; - subflow_generate_hmac(msk->remote_key, msk->local_key, + subflow_generate_hmac(READ_ONCE(msk->remote_key), + READ_ONCE(msk->local_key), subflow_req->remote_nonce, subflow_req->local_nonce, hmac); @@ -774,6 +803,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; + enum sk_rst_reason reason; struct mptcp_sock *owner; struct sock *child; @@ -873,13 +903,18 @@ create_child: ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } - if (!mptcp_finish_join(child)) + if (!mptcp_finish_join(child)) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child); + + subflow_add_reset_reason(skb, subflow->reset_reason); goto dispose_child; + } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; @@ -887,7 +922,7 @@ create_child: } /* check for expected invariant - should never trigger, just help - * catching eariler subtle bugs + * catching earlier subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || @@ -899,7 +934,8 @@ dispose_child: tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); - req->rsk_ops->send_reset(sk, skb); + reason = mptcp_get_rst_reason(skb); + req->rsk_ops->send_reset(sk, skb, reason); /* The last child reference will be released by the caller */ return child; @@ -1092,6 +1128,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk, } if (mpext->data_fin == 1) { + u64 data_fin_seq; + if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); @@ -1104,26 +1142,26 @@ static enum mapping_status get_mapping_status(struct sock *ssk, */ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; - } else { - if (updated) - mptcp_schedule_work((struct sock *)msk); - - return MAPPING_DATA_FIN; } - } else { - u64 data_fin_seq = mpext->data_seq + data_len - 1; - /* If mpext->data_seq is a 32-bit value, data_fin_seq - * must also be limited to 32 bits. - */ - if (!mpext->dsn64) - data_fin_seq &= GENMASK_ULL(31, 0); + if (updated) + mptcp_schedule_work((struct sock *)msk); - mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); - pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", - data_fin_seq, mpext->dsn64); + return MAPPING_DATA_FIN; } + data_fin_seq = mpext->data_seq + data_len - 1; + + /* If mpext->data_seq is a 32-bit value, data_fin_seq must also + * be limited to 32 bits. + */ + if (!mpext->dsn64) + data_fin_seq &= GENMASK_ULL(31, 0); + + mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); + pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", + data_fin_seq, mpext->dsn64); + /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } @@ -1192,14 +1230,22 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - u32 incr; + struct tcp_sock *tp = tcp_sk(ssk); + u32 offset, incr, avail_len; - incr = limit >= skb->len ? skb->len + fin : limit; + offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; + if (WARN_ON_ONCE(offset > skb->len)) + goto out; - pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, - subflow->map_subflow_seq); + avail_len = skb->len - offset; + incr = limit >= avail_len ? avail_len + fin : limit; + + pr_debug("discarding=%d len=%d offset=%d seq=%d", incr, skb->len, + offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; + +out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) @@ -1234,7 +1280,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; - /* greceful failure can happen only on the MPC subflow */ + /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) return; @@ -1336,7 +1382,7 @@ reset: tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); - tcp_send_active_reset(ssk, GFP_ATOMIC); + mptcp_send_active_reset_reason(ssk); WRITE_ONCE(subflow->data_avail, false); return false; } @@ -1550,8 +1596,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); subflow->remote_key_valid = 1; - subflow->remote_key = msk->remote_key; - subflow->local_key = msk->local_key; + subflow->remote_key = READ_ONCE(msk->remote_key); + subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; mptcp_info2sockaddr(loc, &addr, ssk->sk_family); @@ -1976,6 +2022,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->fully_established = 1; new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; + new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index bfff53e668..4fc39fa2e2 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -52,14 +52,19 @@ static struct mptcp_subflow_context *build_ctx(struct kunit *test) static struct mptcp_sock *build_msk(struct kunit *test) { struct mptcp_sock *msk; + struct sock *sk; msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); sock_net_set((struct sock *)msk, &init_net); + sk = (struct sock *)msk; + /* be sure the token helpers can dereference sk->sk_prot */ - ((struct sock *)msk)->sk_prot = &tcp_prot; + sk->sk_prot = &tcp_prot; + sk->sk_protocol = IPPROTO_MPTCP; + return msk; } diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 374412ed78..ef0f8f7382 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -325,6 +325,7 @@ struct ncsi_dev_priv { spinlock_t lock; /* Protect the NCSI device */ unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ + unsigned int channel_probe_id;/* Current cahnnel ID during probe */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ struct ncsi_request requests[256]; /* Request table */ @@ -343,6 +344,7 @@ struct ncsi_dev_priv { bool multi_package; /* Enable multiple packages */ bool mlx_multi_host; /* Enable multi host Mellanox */ u32 package_whitelist; /* Packages to configure */ + unsigned char channel_count; /* Num of channels to probe */ }; struct ncsi_cmd_arg { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 745c788f1d..5ecf611c88 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -510,17 +510,19 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) break; case ncsi_dev_state_suspend_gls: - ndp->pending_req_num = np->channel_num; + ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_GLS; nca.package = np->id; + nca.channel = ndp->channel_probe_id; + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + ndp->channel_probe_id++; - nd->state = ncsi_dev_state_suspend_dcnt; - NCSI_FOR_EACH_CHANNEL(np, nc) { - nca.channel = nc->id; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; + if (ndp->channel_probe_id == ndp->channel_count) { + ndp->channel_probe_id = 0; + nd->state = ncsi_dev_state_suspend_dcnt; } break; @@ -1345,7 +1347,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; - struct ncsi_channel *nc; struct ncsi_cmd_arg nca; unsigned char index; int ret; @@ -1423,23 +1424,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_cis; break; - case ncsi_dev_state_probe_cis: - ndp->pending_req_num = NCSI_RESERVED_CHANNEL; - - /* Clear initial state */ - nca.type = NCSI_PKT_CMD_CIS; - nca.package = ndp->active_package->id; - for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) { - nca.channel = index; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } - - nd->state = ncsi_dev_state_probe_gvi; - if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)) - nd->state = ncsi_dev_state_probe_keep_phy; - break; case ncsi_dev_state_probe_keep_phy: ndp->pending_req_num = 1; @@ -1452,14 +1436,17 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_gvi; break; + case ncsi_dev_state_probe_cis: case ncsi_dev_state_probe_gvi: case ncsi_dev_state_probe_gc: case ncsi_dev_state_probe_gls: np = ndp->active_package; - ndp->pending_req_num = np->channel_num; + ndp->pending_req_num = 1; - /* Retrieve version, capability or link status */ - if (nd->state == ncsi_dev_state_probe_gvi) + /* Clear initial state Retrieve version, capability or link status */ + if (nd->state == ncsi_dev_state_probe_cis) + nca.type = NCSI_PKT_CMD_CIS; + else if (nd->state == ncsi_dev_state_probe_gvi) nca.type = NCSI_PKT_CMD_GVI; else if (nd->state == ncsi_dev_state_probe_gc) nca.type = NCSI_PKT_CMD_GC; @@ -1467,19 +1454,29 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nca.type = NCSI_PKT_CMD_GLS; nca.package = np->id; - NCSI_FOR_EACH_CHANNEL(np, nc) { - nca.channel = nc->id; - ret = ncsi_xmit_cmd(&nca); - if (ret) - goto error; - } + nca.channel = ndp->channel_probe_id; - if (nd->state == ncsi_dev_state_probe_gvi) + ret = ncsi_xmit_cmd(&nca); + if (ret) + goto error; + + if (nd->state == ncsi_dev_state_probe_cis) { + nd->state = ncsi_dev_state_probe_gvi; + if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) && ndp->channel_probe_id == 0) + nd->state = ncsi_dev_state_probe_keep_phy; + } else if (nd->state == ncsi_dev_state_probe_gvi) { nd->state = ncsi_dev_state_probe_gc; - else if (nd->state == ncsi_dev_state_probe_gc) + } else if (nd->state == ncsi_dev_state_probe_gc) { nd->state = ncsi_dev_state_probe_gls; - else + } else { + nd->state = ncsi_dev_state_probe_cis; + ndp->channel_probe_id++; + } + + if (ndp->channel_probe_id == ndp->channel_count) { + ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_probe_dp; + } break; case ncsi_dev_state_probe_dp: ndp->pending_req_num = 1; @@ -1780,6 +1777,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->requests[i].ndp = ndp; timer_setup(&ndp->requests[i].timer, ncsi_request_timeout, 0); } + ndp->channel_count = NCSI_RESERVED_CHANNEL; spin_lock_irqsave(&ncsi_dev_lock, flags); list_add_tail_rcu(&ndp->node, &ncsi_dev_list); @@ -1813,6 +1811,7 @@ int ncsi_start_dev(struct ncsi_dev *nd) if (!(ndp->flags & NCSI_DEV_PROBED)) { ndp->package_probe_id = 0; + ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_probe; schedule_work(&ndp->work); return 0; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index bee290d0f4..e28be33bdf 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -795,12 +795,13 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) struct ncsi_rsp_gc_pkt *rsp; struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; + struct ncsi_package *np; size_t size; /* Find the channel */ rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp); ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, - NULL, &nc); + &np, &nc); if (!nc) return -ENODEV; @@ -835,6 +836,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) */ nc->vlan_filter.bitmap = U64_MAX; nc->vlan_filter.n_vids = rsp->vlan_cnt; + np->ndp->channel_count = rsp->channel_cnt; return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 441d1f1341..df2dc21304 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -818,7 +818,7 @@ config NETFILTER_XT_TARGET_AUDIT config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `CHECKSUM' target, which can be used in the iptables mangle @@ -869,7 +869,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_CT tristate '"CT" target support' depends on NF_CONNTRACK - depends on IP_NF_RAW || IP6_NF_RAW + depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT depends on NETFILTER_ADVANCED help This options adds a `CT' target, which allows to specify initial @@ -880,7 +880,7 @@ config NETFILTER_XT_TARGET_CT config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `DSCP' target, which allows you to manipulate @@ -896,7 +896,7 @@ config NETFILTER_XT_TARGET_DSCP config NETFILTER_XT_TARGET_HL tristate '"HL" hoplimit target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds the "HL" (for IPv6) and "TTL" (for IPv4) @@ -1080,7 +1080,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n - depends on IP_NF_MANGLE + depends on IP_NF_MANGLE || NFT_COMPAT select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n select NF_TPROXY_IPV4 @@ -1147,7 +1147,7 @@ config NETFILTER_XT_TARGET_TCPMSS config NETFILTER_XT_TARGET_TCPOPTSTRIP tristate '"TCPOPTSTRIP" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a "TCPOPTSTRIP" target, which allows you to strip diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index d4958e7e76..614815a3ed 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -101,7 +101,7 @@ endif endif ifdef CONFIG_NFT_CT -ifdef CONFIG_RETPOLINE +ifdef CONFIG_MITIGATION_RETPOLINE nf_tables-objs += nft_ct_fast.o endif endif diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 3126911f50..b00fc285b3 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -815,12 +815,21 @@ int __init netfilter_init(void) if (ret < 0) goto err; +#ifdef CONFIG_LWTUNNEL + ret = netfilter_lwtunnel_init(); + if (ret < 0) + goto err_lwtunnel_pernet; +#endif ret = netfilter_log_init(); if (ret < 0) - goto err_pernet; + goto err_log_pernet; return 0; -err_pernet: +err_log_pernet: +#ifdef CONFIG_LWTUNNEL + netfilter_lwtunnel_fini(); +err_lwtunnel_pernet: +#endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3184cc6be4..61431690cb 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -53,12 +53,13 @@ MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); /* When the nfnl mutex or ip_set_ref_lock is held: */ -#define ip_set_dereference(p) \ - rcu_dereference_protected(p, \ +#define ip_set_dereference(inst) \ + rcu_dereference_protected((inst)->ip_set_list, \ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ - lockdep_is_held(&ip_set_ref_lock)) + lockdep_is_held(&ip_set_ref_lock) || \ + (inst)->is_deleted) #define ip_set(inst, id) \ - ip_set_dereference((inst)->ip_set_list)[id] + ip_set_dereference(inst)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] #define ip_set_dereference_nfnl(p) \ @@ -1133,7 +1134,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, if (!list) goto cleanup; /* nfnl mutex is held, both lists are valid */ - tmp = ip_set_dereference(inst->ip_set_list); + tmp = ip_set_dereference(inst); memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max); rcu_assign_pointer(inst->ip_set_list, list); /* Make sure all current packets have passed through */ @@ -1172,23 +1173,50 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; +/* In order to return quickly when destroying a single set, it is split + * into two stages: + * - Cancel garbage collector + * - Destroy the set itself via call_rcu() + */ + static void -ip_set_destroy_set(struct ip_set *set) +ip_set_destroy_set_rcu(struct rcu_head *head) { - pr_debug("set: %s\n", set->name); + struct ip_set *set = container_of(head, struct ip_set, rcu); - /* Must call it without holding any lock */ set->variant->destroy(set); module_put(set->type->me); kfree(set); } static void -ip_set_destroy_set_rcu(struct rcu_head *head) +_destroy_all_sets(struct ip_set_net *inst) { - struct ip_set *set = container_of(head, struct ip_set, rcu); + struct ip_set *set; + ip_set_id_t i; + bool need_wait = false; - ip_set_destroy_set(set); + /* First cancel gc's: set:list sets are flushed as well */ + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + set->variant->cancel_gc(set); + if (set->type->features & IPSET_TYPE_NAME) + need_wait = true; + } + } + /* Must wait for flush to be really finished */ + if (need_wait) + rcu_barrier(); + for (i = 0; i < inst->ip_set_max; i++) { + set = ip_set(inst, i); + if (set) { + ip_set(inst, i) = NULL; + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); + } + } } static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, @@ -1202,11 +1230,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Commands are serialized and references are * protected by the ip_set_ref_lock. * External systems (i.e. xt_set) must call - * ip_set_put|get_nfnl_* functions, that way we + * ip_set_nfnl_get_* functions, that way we * can safely check references here. * * list:set timer can only decrement the reference @@ -1214,8 +1241,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, * without holding the lock. */ if (!attr[IPSET_ATTR_SETNAME]) { - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); @@ -1226,15 +1251,7 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, } inst->is_destroyed = true; read_unlock_bh(&ip_set_ref_lock); - for (i = 0; i < inst->ip_set_max; i++) { - s = ip_set(inst, i); - if (s) { - ip_set(inst, i) = NULL; - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); - ip_set_destroy_set(s); - } - } + _destroy_all_sets(inst); /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { @@ -1255,12 +1272,12 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); if (features & IPSET_TYPE_NAME) { /* Must wait for flush to be really finished */ rcu_barrier(); } - /* Must cancel garbage collectors */ - s->variant->cancel_gc(s); call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; @@ -2365,30 +2382,25 @@ ip_set_net_init(struct net *net) } static void __net_exit -ip_set_net_exit(struct net *net) +ip_set_net_pre_exit(struct net *net) { struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set = NULL; - ip_set_id_t i; - inst->is_deleted = true; /* flag for ip_set_nfnl_put */ +} - nfnl_lock(NFNL_SUBSYS_IPSET); - for (i = 0; i < inst->ip_set_max; i++) { - set = ip_set(inst, i); - if (set) { - ip_set(inst, i) = NULL; - set->variant->cancel_gc(set); - ip_set_destroy_set(set); - } - } - nfnl_unlock(NFNL_SUBSYS_IPSET); +static void __net_exit +ip_set_net_exit(struct net *net) +{ + struct ip_set_net *inst = ip_set_pernet(net); + + _destroy_all_sets(inst); kvfree(rcu_dereference_protected(inst->ip_set_list, 1)); } static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, + .pre_exit = ip_set_net_pre_exit, .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 6c3f28bc59..bfae706693 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb, struct set_elem *e; int ret; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; - int ret; + int ret = 0; - list_for_each_entry(e, &map->members, list) { + rcu_read_lock(); + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (d->before == 0) { ret = 1; + goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && @@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, } else { ret = prev && prev->id == d->refid; } - return ret; + goto out; } - return 0; +out: + rcu_read_unlock(); + return ret; } static void @@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, /* Find where to add the new entry */ n = prev = next = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, { struct list_set *map = set->data; struct set_adt_elem *d = value; - struct set_elem *e, *next, *prev = NULL; + struct set_elem *e, *n, *next, *prev = NULL; - list_for_each_entry(e, &map->members, list) { + list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; @@ -424,14 +428,8 @@ static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; - struct set_elem *e, *n; - list_for_each_entry_safe(e, n, &map->members, list) { - list_del(&e->list); - ip_set_put_byindex(map->net, e->id); - ip_set_ext_destroy(set, e); - kfree(e); - } + WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; @@ -549,6 +547,9 @@ list_set_cancel_gc(struct ip_set *set) if (SET_WITH_TIMEOUT(set)) timer_shutdown_sync(&map->gc); + + /* Flush list to drop references to other ipsets */ + list_set_flush(set); } static const struct ip_set_type_variant set_variant = { diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a743db0738..98d7dbe3d7 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1511,9 +1511,7 @@ int __init ip_vs_conn_init(void) return -ENOMEM; /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); if (!ip_vs_conn_cachep) { kvfree(ip_vs_conn_tab); return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a2c16b5010..c7a8a08b73 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1550,6 +1550,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ @@ -1560,7 +1561,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; - return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + + gre_flags_to_tnl_flags(flags, greh->flags); + + return gre_calc_hlen(flags); } unk: diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 143a341bbc..f4384e147e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -94,6 +94,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; + int amemthresh; int nomem; int to_change = -1; @@ -105,7 +106,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < ipvs->sysctl_amemthresh); + amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); + nomem = (availmem < amemthresh); local_bh_disable(); @@ -145,9 +147,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 1: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; @@ -155,9 +156,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 2: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; @@ -1459,18 +1459,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (ret < 0) goto out_err; - /* Bind the ct retriever */ - RCU_INIT_POINTER(svc->pe, pe); - pe = NULL; - /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - if (svc->pe && svc->pe->conn_out) + if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -2263,7 +2263,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { } }; #endif @@ -4270,6 +4269,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); + bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); @@ -4284,12 +4284,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; - ctl_table_size = 0; - } } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -4315,10 +4309,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; @@ -4341,15 +4342,22 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 8ceec7a2fa..2423513d70 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -123,7 +123,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -563,10 +562,8 @@ static int __net_init __ip_vs_lblc_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblc_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblc_ctl_table = vs_vars_table; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0fb6470721..cdb1d4bf67 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -294,7 +294,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -749,10 +748,8 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblcr_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1e689c7141..83e4529164 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -126,7 +126,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; - if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb)) + if (!skb_is_gso(skb)) sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -175,7 +175,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; - if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb)) + if (!skb_is_gso(skb)) sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 65e0259178..3313bceb6c 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -180,7 +180,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (addr_type & IPV6_ADDR_LOOPBACK); old_rt_is_local = __ip_vs_is_local_route6( - (struct rt6_info *)skb_dst(skb)); + dst_rt6_info(skb_dst(skb))); } else #endif { @@ -318,7 +318,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); @@ -390,10 +390,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { @@ -481,7 +481,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rt6_info *) dest_dst->dst_cache; + rt = dst_rt6_info(dest_dst->dst_cache); else { u32 cookie; @@ -501,7 +501,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); @@ -517,7 +517,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, rt_mode); if (!dst) goto err_unreach; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); } local = __ip_vs_is_local_route6(rt); @@ -553,10 +553,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { @@ -862,7 +862,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1082,11 +1082,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb, { __be16 proto = *next_protocol == IPPROTO_IPIP ? htons(ETH_P_IP) : htons(ETH_P_IPV6); - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t hdrlen; if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); hdrlen = gre_calc_hlen(tflags); gre_build_header(skb, hdrlen, tflags, proto, 0, 0); @@ -1165,11 +1165,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1288,7 +1288,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (local) return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); tdev = rt->dst.dev; /* @@ -1310,11 +1310,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1590,7 +1590,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 0e4beae421..5257d5e7eb 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, static const struct bpf_func_proto * bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } const struct bpf_verifier_ops netfilter_verifier_ops = { diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 5d8ed6c90b..8715617b02 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -605,15 +605,11 @@ static int __init nf_conncount_modinit(void) for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); - conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", - sizeof(struct nf_conncount_tuple), - 0, 0, NULL); + conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0); if (!conncount_conn_cachep) return -ENOMEM; - conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", - sizeof(struct nf_conncount_rb), - 0, 0, NULL); + conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 475358ec82..d2492d050f 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -467,7 +467,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) __bpf_kfunc_end_defs(); -BTF_SET8_START(nf_ct_kfunc_set) +BTF_KFUNCS_START(nf_ct_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) @@ -478,7 +478,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_ct_kfunc_set) +BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5b876fa7f9..7ac20750c1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1440,8 +1440,6 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) const struct nf_conntrack_l4proto *l4proto; u8 protonum = nf_ct_protonum(ct); - if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP) - return false; if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; @@ -2024,7 +2022,7 @@ repeat: goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); - if (ret == -NF_DROP) + if (ret == NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; @@ -2530,7 +2528,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) * netfilter framework. Roll on, two-stage module * delete... */ - synchronize_net(); + synchronize_rcu_expedited(); i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 81ca348915..21fa550966 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -722,9 +722,7 @@ int nf_conntrack_expect_init(void) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", - sizeof(struct nf_conntrack_expect), - 0, 0, NULL); + nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0); if (!nf_ct_expect_cachep) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3b846cbdc0..4cbf71d078 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3420,7 +3420,8 @@ static int ctnetlink_del_expect(struct sk_buff *skb, if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index e2db1f4ec2..ebc4f733bb 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -525,7 +525,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) - return NF_DROP; + return -NF_ACCEPT; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; @@ -533,7 +533,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, /* pull again, including possible 48 bit sequences and subtype header */ dh = dccp_header_pointer(skb, dataoff, dh, &_dh); if (!dh) - return NF_DROP; + return -NF_ACCEPT; type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 1020d67600..327b805902 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = { [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 + [ICMPV6_MLD2_REPORT - 130] = 1, + [ICMPV6_MRDISC_ADV - 130] = 1, + [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee98ce5b8..6c40bdf8b0 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -22,9 +22,6 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> -#ifdef CONFIG_LWTUNNEL -#include <net/netfilter/nf_hooks_lwtunnel.h> -#endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; @@ -612,15 +609,10 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif -#ifdef CONFIG_LWTUNNEL - NF_SYSCTL_CT_LWTUNNEL, -#endif - __NF_SYSCTL_CT_LAST_SYSCTL, + NF_SYSCTL_CT_LAST_SYSCTL, }; -#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) - static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", @@ -948,16 +940,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_LWTUNNEL - [NF_SYSCTL_CT_LWTUNNEL] = { - .procname = "nf_hooks_lwtunnel", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = nf_hooks_lwtunnel_sysctl_handler, - }, -#endif - {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -968,7 +950,6 @@ static struct ctl_table nf_ct_netfilter_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, @@ -1122,7 +1103,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index a057133923..5c1ff07eae 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -77,12 +77,8 @@ EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { - const struct rt6_info *rt; - - if (flow_tuple->l3proto == NFPROTO_IPV6) { - rt = (const struct rt6_info *)flow_tuple->dst_cache; - return rt6_get_cookie(rt); - } + if (flow_tuple->l3proto == NFPROTO_IPV6) + return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 5383bed3d3..c2c005234d 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -434,7 +434,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; @@ -446,7 +446,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); @@ -729,7 +729,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; @@ -741,7 +741,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c index 00e89ffd78..d8ebebc977 100644 --- a/net/netfilter/nf_hooks_lwtunnel.c +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -3,6 +3,9 @@ #include <linux/sysctl.h> #include <net/lwtunnel.h> #include <net/netfilter/nf_hooks_lwtunnel.h> +#include <linux/netfilter.h> + +#include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { @@ -50,4 +53,71 @@ int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); + +static struct ctl_table nf_lwtunnel_sysctl_table[] = { + { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +}; + +static int __net_init nf_lwtunnel_net_init(struct net *net) +{ + struct ctl_table_header *hdr; + struct ctl_table *table; + + table = nf_lwtunnel_sysctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(nf_lwtunnel_sysctl_table, + sizeof(nf_lwtunnel_sysctl_table), + GFP_KERNEL); + if (!table) + goto err_alloc; + } + + hdr = register_net_sysctl_sz(net, "net/netfilter", table, + ARRAY_SIZE(nf_lwtunnel_sysctl_table)); + if (!hdr) + goto err_reg; + + net->nf.nf_lwtnl_dir_header = hdr; + + return 0; +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit nf_lwtunnel_net_exit(struct net *net) +{ + const struct ctl_table *table; + + table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; + unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct pernet_operations nf_lwtunnel_net_ops = { + .init = nf_lwtunnel_net_init, + .exit = nf_lwtunnel_net_exit, +}; + +int __init netfilter_lwtunnel_init(void) +{ + return register_pernet_subsys(&nf_lwtunnel_net_ops); +} + +void netfilter_lwtunnel_fini(void) +{ + unregister_pernet_subsys(&nf_lwtunnel_net_ops); +} +#else +int __init netfilter_lwtunnel_init(void) { return 0; } +void netfilter_lwtunnel_fini(void) {} #endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 832ae64179..2540302306 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ int __init netfilter_log_init(void); +#ifdef CONFIG_LWTUNNEL +/* nf_hooks_lwtunnel.c */ +int __init netfilter_lwtunnel_init(void); +void netfilter_lwtunnel_fini(void); +#endif + /* core.c */ void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index e16f158388..769fd7680f 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -31,10 +31,10 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) int i; for (i = 0; i < NF_LOG_TYPE_MAX; i++) { - if (loggers[pf][i] == NULL) + log = nft_log_dereference(loggers[pf][i]); + if (!log) continue; - log = nft_log_dereference(loggers[pf][i]); if (!strncasecmp(str_logger, log->name, strlen(log->name))) return log; } @@ -156,6 +156,11 @@ int nf_logger_find_get(int pf, enum nf_log_type type) struct nf_logger *logger; int ret = -ENOENT; + if (pf >= ARRAY_SIZE(loggers)) + return -EINVAL; + if (type >= NF_LOG_TYPE_MAX) + return -EINVAL; + if (pf == NFPROTO_INET) { ret = nf_logger_find_get(NFPROTO_IPV4, type); if (ret < 0) @@ -390,7 +395,7 @@ static const struct seq_operations nflog_seq_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; -static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { @@ -401,7 +406,6 @@ static struct ctl_table nf_log_sysctl_ftable[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int nf_log_proc_dostring(struct ctl_table *table, int write, @@ -509,7 +513,7 @@ err_alloc: static void netfilter_log_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 6e3b2f5885..481be15609 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -54,9 +54,9 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); -BTF_SET8_START(nf_nat_kfunc_set) +BTF_KFUNCS_START(nf_nat_kfunc_set) BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) -BTF_SET8_END(nf_nat_kfunc_set) +BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index e2f334f702..7f12e56e6e 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -248,109 +248,3 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, return 0; } EXPORT_SYMBOL_GPL(nf_queue); - -static unsigned int nf_iterate(struct sk_buff *skb, - struct nf_hook_state *state, - const struct nf_hook_entries *hooks, - unsigned int *index) -{ - const struct nf_hook_entry *hook; - unsigned int verdict, i = *index; - - while (i < hooks->num_hook_entries) { - hook = &hooks->hooks[i]; -repeat: - verdict = nf_hook_entry_hookfn(hook, skb, state); - if (verdict != NF_ACCEPT) { - *index = i; - if (verdict != NF_REPEAT) - return verdict; - goto repeat; - } - i++; - } - - *index = i; - return NF_ACCEPT; -} - -static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) -{ - switch (pf) { -#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); -#endif - case NFPROTO_IPV4: - return rcu_dereference(net->nf.hooks_ipv4[hooknum]); - case NFPROTO_IPV6: - return rcu_dereference(net->nf.hooks_ipv6[hooknum]); - default: - WARN_ON_ONCE(1); - return NULL; - } - - return NULL; -} - -/* Caller must hold rcu read-side lock */ -void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) -{ - const struct nf_hook_entry *hook_entry; - const struct nf_hook_entries *hooks; - struct sk_buff *skb = entry->skb; - const struct net *net; - unsigned int i; - int err; - u8 pf; - - net = entry->state.net; - pf = entry->state.pf; - - hooks = nf_hook_entries_head(net, pf, entry->state.hook); - - i = entry->hook_index; - if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { - kfree_skb(skb); - nf_queue_entry_free(entry); - return; - } - - hook_entry = &hooks->hooks[i]; - - /* Continue traversal iff userspace said ok... */ - if (verdict == NF_REPEAT) - verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); - - if (verdict == NF_ACCEPT) { - if (nf_reroute(skb, entry) < 0) - verdict = NF_DROP; - } - - if (verdict == NF_ACCEPT) { -next_hook: - ++i; - verdict = nf_iterate(skb, &entry->state, hooks, &i); - } - - switch (verdict & NF_VERDICT_MASK) { - case NF_ACCEPT: - case NF_STOP: - local_bh_disable(); - entry->state.okfn(entry->state.net, entry->state.sk, skb); - local_bh_enable(); - break; - case NF_QUEUE: - err = nf_queue(skb, &entry->state, i, verdict); - if (err == 1) - goto next_hook; - break; - case NF_STOLEN: - break; - default: - kfree_skb(skb); - } - - nf_queue_entry_free(entry); -} -EXPORT_SYMBOL(nf_reinject); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index fbbc4fd373..5b140c12b7 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -800,7 +800,7 @@ synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, skb_reset_network_header(skb); iph = skb_put(skb, sizeof(*iph)); ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; + iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit); iph->nexthdr = IPPROTO_TCP; iph->saddr = *saddr; iph->daddr = *daddr; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0e697e53a7..91cc3a81ba 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1203,8 +1203,10 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) #define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) #define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) #define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) +#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ - __NFT_TABLE_F_WAS_AWAKEN) + __NFT_TABLE_F_WAS_AWAKEN | \ + __NFT_TABLE_F_WAS_ORPHAN) static bool nft_table_pending_update(const struct nft_ctx *ctx) { @@ -1244,8 +1246,11 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if ((nft_table_has_owner(ctx->table) && !(flags & NFT_TABLE_F_OWNER)) || - (!nft_table_has_owner(ctx->table) && - flags & NFT_TABLE_F_OWNER)) + (flags & NFT_TABLE_F_OWNER && + !nft_table_is_orphan(ctx->table))) + return -EOPNOTSUPP; + + if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ @@ -1274,6 +1279,13 @@ static int nf_tables_updtable(struct nft_ctx *ctx) } } + if ((flags & NFT_TABLE_F_OWNER) && + !nft_table_has_owner(ctx->table)) { + ctx->table->nlpid = ctx->portid; + ctx->table->flags |= NFT_TABLE_F_OWNER | + __NFT_TABLE_F_WAS_ORPHAN; + } + nft_trans_table_update(trans) = true; nft_trans_commit_list_add_tail(ctx->net, trans); @@ -3321,7 +3333,7 @@ err_expr_parse: return ERR_PTR(err); } -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; @@ -3329,7 +3341,7 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) return -EINVAL; dst->ops = src->ops; - err = src->ops->clone(dst, src); + err = src->ops->clone(dst, src, gfp); if (err < 0) return err; @@ -3811,6 +3823,15 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r nf_tables_rule_destroy(ctx, rule); } +/** nft_chain_validate - loop detection and hook validation + * + * @ctx: context containing call depth and base chain + * @chain: chain to validate + * + * Walk through the rules of the given chain and chase all jumps/gotos + * and set lookups until either the jump limit is hit or all reachable + * chains have been validated. + */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; @@ -3832,6 +3853,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (!expr->ops->validate) continue; + /* This may call nft_chain_validate() recursively, + * callers that do so must increment ctx->level. + */ err = expr->ops->validate(ctx, expr, &data); if (err < 0) return err; @@ -4291,23 +4315,18 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) * given, in that case the amount of memory per element is used. */ static const struct nft_set_ops * -nft_select_set_ops(const struct nft_ctx *ctx, - const struct nlattr * const nla[], +nft_select_set_ops(const struct nft_ctx *ctx, u32 flags, const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; - u32 flags = 0; int i; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); - if (nla[NFTA_SET_FLAGS] != NULL) - flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - bops = NULL; best.size = ~0; best.lookup = ~0; @@ -5060,9 +5079,6 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) return -EOPNOTSUPP; - if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == - (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) - return -EOPNOTSUPP; } desc.dtype = 0; @@ -5202,7 +5218,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(&ctx, nla, &desc); + ops = nft_select_set_ops(&ctx, flags, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -5736,8 +5752,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext), - set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE, - set->dlen) < 0) + nft_set_datatype(set), set->dlen) < 0) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && @@ -6521,7 +6536,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, if (!expr) goto err_expr; - err = nft_expr_clone(expr, set->exprs[i]); + err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT); if (err < 0) { kfree(expr); goto err_expr; @@ -6560,7 +6575,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx, for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - err = nft_expr_clone(expr, expr_array[i]); + err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT); if (err < 0) goto err_elem_expr_setup; @@ -7772,6 +7787,9 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, if (WARN_ON_ONCE(!type)) return -ENOENT; + if (!obj->ops->update) + return 0; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); @@ -9463,9 +9481,10 @@ static void nft_obj_commit_update(struct nft_trans *trans) obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); - if (obj->ops->update) - obj->ops->update(obj, newobj); + if (WARN_ON_ONCE(!obj->ops->update)) + return; + obj->ops->update(obj, newobj); nft_obj_destroy(&trans->ctx, newobj); } @@ -10537,6 +10556,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) { trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT; } + if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_ORPHAN) { + trans->ctx.table->flags &= ~NFT_TABLE_F_OWNER; + trans->ctx.table->nlpid = 0; + } trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE; nft_trans_destroy(trans); } else { @@ -10798,150 +10821,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, } EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); -/* - * Loop detection - walk through the ruleset beginning at the destination chain - * of a new jump until either the source chain is reached (loop) or all - * reachable chains have been traversed. - * - * The loop check is performed whenever a new jump verdict is added to an - * expression or verdict map or a verdict map is bound to a new chain. - */ - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain); - -static int nft_check_loops(const struct nft_ctx *ctx, - const struct nft_set_ext *ext) -{ - const struct nft_data *data; - int ret; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - ret = nf_tables_check_loops(ctx, data->verdict.chain); - break; - default: - ret = 0; - break; - } - - return ret; -} - -static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_elem_priv *elem_priv) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - - if (!nft_set_elem_active(ext, iter->genmask)) - return 0; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - return nft_check_loops(ctx, ext); -} - -static int nft_set_catchall_loops(const struct nft_ctx *ctx, - struct nft_set *set) -{ - u8 genmask = nft_genmask_next(ctx->net); - struct nft_set_elem_catchall *catchall; - struct nft_set_ext *ext; - int ret = 0; - - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) - continue; - - ret = nft_check_loops(ctx, ext); - if (ret < 0) - return ret; - } - - return ret; -} - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain) -{ - const struct nft_rule *rule; - const struct nft_expr *expr, *last; - struct nft_set *set; - struct nft_set_binding *binding; - struct nft_set_iter iter; - - if (ctx->chain == chain) - return -ELOOP; - - if (fatal_signal_pending(current)) - return -EINTR; - - list_for_each_entry(rule, &chain->rules, list) { - nft_rule_for_each_expr(expr, last, rule) { - struct nft_immediate_expr *priv; - const struct nft_data *data; - int err; - - if (strcmp(expr->ops->type->name, "immediate")) - continue; - - priv = nft_expr_priv(expr); - if (priv->dreg != NFT_REG_VERDICT) - continue; - - data = &priv->data; - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - err = nf_tables_check_loops(ctx, - data->verdict.chain); - if (err < 0) - return err; - break; - default: - break; - } - } - } - - list_for_each_entry(set, &ctx->table->sets, list) { - if (!nft_is_active_next(ctx->net, set)) - continue; - if (!(set->flags & NFT_SET_MAP) || - set->dtype != NFT_DATA_VERDICT) - continue; - - list_for_each_entry(binding, &set->bindings, list) { - if (!(binding->flags & NFT_SET_MAP) || - binding->chain != chain) - continue; - - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_loop_check_setelem; - - set->ops->walk(ctx, set, &iter); - if (!iter.err) - iter.err = nft_set_catchall_loops(ctx, set); - - if (iter.err < 0) - return iter.err; - } - } - - return 0; -} - /** * nft_parse_u32_check - fetch u32 attribute and check for maximum value * @@ -11054,13 +10933,16 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, if (data != NULL && (data->verdict.code == NFT_GOTO || data->verdict.code == NFT_JUMP)) { - err = nf_tables_check_loops(ctx, data->verdict.chain); + err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; } return 0; default: + if (type != NFT_DATA_VALUE) + return -EINVAL; + if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) return -EINVAL; if (len == 0) @@ -11069,8 +10951,6 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, sizeof_field(struct nft_regs, data)) return -ERANGE; - if (data != NULL && type != NFT_DATA_VALUE) - return -EINVAL; return 0; } } @@ -11471,12 +11351,15 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nf_tables_destroy_list)) - nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { + if (table->flags & NFT_TABLE_F_PERSIST) { + table->flags &= ~NFT_TABLE_F_OWNER; + continue; + } __nft_release_hook(net, table); list_del_rcu(&table->list); to_delete[deleted++] = table; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index c3e6353647..a48d5f0e2f 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -21,7 +21,7 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_X86) +#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86) static struct static_key_false nf_tables_skip_direct_calls; @@ -207,7 +207,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_pktinfo *pkt) { -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE unsigned long e; if (nf_skip_indirect_calls()) @@ -236,7 +236,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr, X(e, nft_objref_map_eval); #undef X indirect_call: -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c9fbe0f707..4abf660c7b 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -427,6 +427,9 @@ replay_abort: nfnl_unlock(subsys_id); + if (nlh->nlmsg_flags & NLM_F_ACK) + nfnl_err_add(&err_list, nlh, 0, &extack); + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -573,6 +576,8 @@ done: } else if (err) { ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } else if (nlh->nlmsg_flags & NLM_F_ACK) { + nfnl_err_add(&err_list, nlh, 0, &extack); } } else { enum nfnl_abort_action abort_action; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 5cf38fc0a3..55e28e1da6 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -169,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head) struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); + rcu_read_lock(); nfqnl_flush(inst, NULL, 0); + rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } @@ -225,6 +227,148 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) return entry; } +static unsigned int nf_iterate(struct sk_buff *skb, + struct nf_hook_state *state, + const struct nf_hook_entries *hooks, + unsigned int *index) +{ + const struct nf_hook_entry *hook; + unsigned int verdict, i = *index; + + while (i < hooks->num_hook_entries) { + hook = &hooks->hooks[i]; +repeat: + verdict = nf_hook_entry_hookfn(hook, skb, state); + if (verdict != NF_ACCEPT) { + *index = i; + if (verdict != NF_REPEAT) + return verdict; + goto repeat; + } + i++; + } + + *index = i; + return NF_ACCEPT; +} + +static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) +{ + switch (pf) { +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + case NFPROTO_BRIDGE: + return rcu_dereference(net->nf.hooks_bridge[hooknum]); +#endif + case NFPROTO_IPV4: + return rcu_dereference(net->nf.hooks_ipv4[hooknum]); + case NFPROTO_IPV6: + return rcu_dereference(net->nf.hooks_ipv6[hooknum]); + default: + WARN_ON_ONCE(1); + return NULL; + } + + return NULL; +} + +static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) +{ +#ifdef CONFIG_INET + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->state.hook == NF_INET_LOCAL_OUT) { + const struct iphdr *iph = ip_hdr(skb); + + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) + return ip_route_me_harder(entry->state.net, entry->state.sk, + skb, RTN_UNSPEC); + } +#endif + return 0; +} + +static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) +{ + const struct nf_ipv6_ops *v6ops; + int ret = 0; + + switch (entry->state.pf) { + case AF_INET: + ret = nf_ip_reroute(skb, entry); + break; + case AF_INET6: + v6ops = rcu_dereference(nf_ipv6_ops); + if (v6ops) + ret = v6ops->reroute(skb, entry); + break; + } + return ret; +} + +/* caller must hold rcu read-side lock */ +static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) +{ + const struct nf_hook_entry *hook_entry; + const struct nf_hook_entries *hooks; + struct sk_buff *skb = entry->skb; + const struct net *net; + unsigned int i; + int err; + u8 pf; + + net = entry->state.net; + pf = entry->state.pf; + + hooks = nf_hook_entries_head(net, pf, entry->state.hook); + + i = entry->hook_index; + if (!hooks || i >= hooks->num_hook_entries) { + kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); + nf_queue_entry_free(entry); + return; + } + + hook_entry = &hooks->hooks[i]; + + /* Continue traversal iff userspace said ok... */ + if (verdict == NF_REPEAT) + verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); + + if (verdict == NF_ACCEPT) { + if (nf_reroute(skb, entry) < 0) + verdict = NF_DROP; + } + + if (verdict == NF_ACCEPT) { +next_hook: + ++i; + verdict = nf_iterate(skb, &entry->state, hooks, &i); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_STOP: + local_bh_disable(); + entry->state.okfn(entry->state.net, entry->state.sk, skb); + local_bh_enable(); + break; + case NF_QUEUE: + err = nf_queue(skb, &entry->state, i, verdict); + if (err == 1) + goto next_hook; + break; + case NF_STOLEN: + break; + default: + kfree_skb(skb); + } + + nf_queue_entry_free(entry); +} + static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index d170758a1e..7010541fcc 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -325,9 +325,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_hook *hook, *found = NULL; int n = 0; - if (event != NETDEV_UNREGISTER) - return; - list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev == dev) found = hook; @@ -367,8 +364,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, .net = dev_net(dev), }; - if (event != NETDEV_UNREGISTER && - event != NETDEV_CHANGENAME) + if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(ctx.net); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index de9d1980df..92b984fa81 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -210,12 +210,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx, nft_connlimit_do_destroy(ctx, priv); } -static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC); + priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp); if (!priv_dst->list) return -ENOMEM; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index dccc68a513..291ed20263 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -226,7 +226,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, nft_counter_do_destroy(priv); } -static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_counter_percpu_priv *priv = nft_expr_priv(src); struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); @@ -236,7 +236,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) nft_counter_fetch(priv, &total); - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC); + cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp); if (cpu_stats == NULL) return -ENOMEM; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 255640013a..452ed94c3a 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -754,7 +754,7 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track, return false; } -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), @@ -799,7 +799,7 @@ nft_ct_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG]) { -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (k) { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index c09dba5735..b4ada3ab21 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, for (i = 0; i < priv->num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - if (nft_expr_clone(expr, priv->expr_array[i]) < 0) + if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0) return -1; elem_expr->size += priv->expr_array[i]->ops->size; diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 37cfe6dd71..b58f62195f 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -35,11 +35,9 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, switch (priv->result) { case NFT_FIB_RESULT_OIF: case NFT_FIB_RESULT_OIFNAME: - hooks = (1 << NF_INET_PRE_ROUTING); - if (priv->flags & NFTA_FIB_F_IIF) { - hooks |= (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD); - } + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); break; case NFT_FIB_RESULT_ADDRTYPE: if (priv->flags & NFTA_FIB_F_IIF) diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 8e6d7eaf9d..de1b6066bf 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -102,12 +102,12 @@ static void nft_last_destroy(const struct nft_ctx *ctx, kfree(priv->last); } -static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_last_priv *priv_dst = nft_expr_priv(dst); struct nft_last_priv *priv_src = nft_expr_priv(src); - priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC); + priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); if (!priv_dst->last) return -ENOMEM; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index cefa25e0db..21d26b79b4 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -150,7 +150,7 @@ static void nft_limit_destroy(const struct nft_ctx *ctx, } static int nft_limit_clone(struct nft_limit_priv *priv_dst, - const struct nft_limit_priv *priv_src) + const struct nft_limit_priv *priv_src, gfp_t gfp) { priv_dst->tokens_max = priv_src->tokens_max; priv_dst->rate = priv_src->rate; @@ -158,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst, priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; - priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC); + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); if (!priv_dst->limit) return -ENOMEM; @@ -223,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, &priv->limit); } -static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); priv_dst->cost = priv_src->cost; - return nft_limit_clone(&priv_dst->limit, &priv_src->limit); + return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); } static struct nft_expr_type nft_limit_type; @@ -281,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, priv); } -static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv *priv_dst = nft_expr_priv(dst); struct nft_limit_priv *priv_src = nft_expr_priv(src); - return nft_limit_clone(priv_dst, priv_src); + return nft_limit_clone(priv_dst, priv_src, gfp); } static const struct nft_expr_ops nft_limit_bytes_ops = { diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 87c18eddb0..f3080fa1b2 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,7 +24,7 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { @@ -132,7 +132,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], - &priv->dreg, NULL, set->dtype, + &priv->dreg, NULL, + nft_set_datatype(set), set->dlen); if (err < 0) return err; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index ba0d3683a4..9139ce38ea 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -839,6 +839,9 @@ static int nft_meta_inner_init(const struct nft_ctx *ctx, struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; + if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG]) + return -EINVAL; + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_PROTOCOL: diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 7f61506e5b..7fec57ff73 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -63,7 +63,6 @@ static int nft_osf_init(const struct nft_ctx *ctx, { struct nft_osf *priv = nft_expr_priv(expr); u32 flags; - int err; u8 ttl; if (!tb[NFTA_OSF_DREG]) @@ -83,13 +82,9 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->flags = flags; } - err = nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, - NULL, NFT_DATA_VALUE, - NFT_OSF_MAXGENRELEN); - if (err < 0) - return err; - - return 0; + return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, + NFT_OSF_MAXGENRELEN); } static int nft_osf_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 0a689c8e02..50429cbd42 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -45,36 +45,27 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; - u8 vlan_hlen = 0; - - if ((skb->protocol == htons(ETH_P_8021AD) || - skb->protocol == htons(ETH_P_8021Q)) && - offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN) - vlan_hlen += VLAN_HLEN; vlanh = (u8 *) &veth; - if (offset < VLAN_ETH_HLEN + vlan_hlen) { + if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; - if (vlan_hlen && - skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0) - return false; - else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) + if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; - if (offset + len > VLAN_ETH_HLEN + vlan_hlen) - ethlen -= offset + len - VLAN_ETH_HLEN - vlan_hlen; + if (offset + len > VLAN_ETH_HLEN) + ethlen -= offset + len - VLAN_ETH_HLEN; - memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen); + memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; - offset = ETH_HLEN + vlan_hlen; + offset = ETH_HLEN; } else { - offset -= VLAN_HLEN + vlan_hlen; + offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; @@ -154,12 +145,12 @@ int nft_payload_inner_offset(const struct nft_pktinfo *pkt) return pkt->inneroff; } -static bool nft_payload_need_vlan_copy(const struct nft_payload *priv) +static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) { - unsigned int len = priv->offset + priv->len; + unsigned int boundary = offset + len; /* data past ether src/dst requested, copy needed */ - if (len > offsetof(struct ethhdr, h_proto)) + if (boundary > offsetof(struct ethhdr, h_proto)) return true; return false; @@ -183,7 +174,7 @@ void nft_payload_eval(const struct nft_expr *expr, goto err; if (skb_vlan_tag_present(skb) && - nft_payload_need_vlan_copy(priv)) { + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; @@ -659,6 +650,10 @@ static int nft_payload_inner_init(const struct nft_ctx *ctx, struct nft_payload *priv = nft_expr_priv(expr); u32 base; + if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] || + !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG]) + return -EINVAL; + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); switch (base) { case NFT_PAYLOAD_TUN_HEADER: @@ -810,21 +805,79 @@ struct nft_payload_set { u8 csum_flags; }; +/* This is not struct vlan_hdr. */ +struct nft_payload_vlan_hdr { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; +}; + +static bool +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, + int *vlan_hlen) +{ + struct nft_payload_vlan_hdr *vlanh; + __be16 vlan_proto; + u16 vlan_tci; + + if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { + *vlan_hlen = VLAN_HLEN; + return true; + } + + switch (offset) { + case offsetof(struct vlan_ethhdr, h_vlan_proto): + if (len == 2) { + vlan_proto = nft_reg_load_be16(src); + skb->vlan_proto = vlan_proto; + } else if (len == 4) { + vlanh = (struct nft_payload_vlan_hdr *)src; + __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, + ntohs(vlanh->h_vlan_TCI)); + } else { + return false; + } + break; + case offsetof(struct vlan_ethhdr, h_vlan_TCI): + if (len != 2) + return false; + + vlan_tci = ntohs(nft_reg_load_be16(src)); + skb->vlan_tci = vlan_tci; + break; + default: + return false; + } + + return true; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; const u32 *src = ®s->data[priv->sreg]; - int offset, csum_offset; + int offset, csum_offset, vlan_hlen = 0; + struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; - offset = skb_mac_header(skb) - skb->data; + + if (skb_vlan_tag_present(skb) && + nft_payload_need_vlan_adjust(priv->offset, priv->len)) { + if (!nft_payload_set_vlan(src, skb, + priv->offset, priv->len, + &vlan_hlen)) + goto err; + + if (!vlan_hlen) + return; + } + + offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 3ba12a7471..9b2d7463d3 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -233,7 +233,7 @@ static void nft_quota_destroy(const struct nft_ctx *ctx, return nft_quota_do_destroy(ctx, priv); } -static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_quota *priv_dst = nft_expr_priv(dst); struct nft_quota *priv_src = nft_expr_priv(src); @@ -241,7 +241,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) priv_dst->quota = priv_src->quota; priv_dst->flags = priv_src->flags; - priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC); + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp); if (!priv_dst->consumed) return -ENOMEM; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 24d9771385..14d88394bc 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) goto err; - memcpy(dest, rt6_nexthop((struct rt6_info *)dst, + memcpy(dest, rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index b42a34087e..eb4c4a4ac7 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -359,11 +359,13 @@ * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. */ -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, + unsigned long *dst, const union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; - int k, ret = -1; + unsigned int k; + int ret = -1; for (k = 0; k < len; k++) { bitset = map[k]; @@ -432,7 +434,7 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, res_map = scratch->map + (map_index ? m->bsize_max : 0); fill_map = scratch->map + (map_index ? 0 : m->bsize_max); - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; @@ -502,9 +504,11 @@ out: * pipapo_get() - Get matching element reference given key data * @net: Network namespace * @set: nftables API set representation + * @m: storage containing active/existing elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask * @tstamp: timestamp to check for expired elements + * @gfp: the type of memory to allocate (see kmalloc). * * This is essentially the same as the lookup function, except that it matches * key data against the uncommitted copy and doesn't use preallocated maps for @@ -514,31 +518,31 @@ out: */ static struct nft_pipapo_elem *pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_pipapo_match *m, const u8 *data, u8 genmask, - u64 tstamp) + u64 tstamp, gfp_t gfp) { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); - struct nft_pipapo *priv = nft_set_priv(set); unsigned long *res_map, *fill_map = NULL; - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; int i; - m = priv->clone; + if (m->bsize_max == 0) + return ret; - res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), gfp); if (!res_map) { ret = ERR_PTR(-ENOMEM); goto out; } - fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + fill_map = kcalloc(m->bsize_max, sizeof(*res_map), gfp); if (!fill_map) { ret = ERR_PTR(-ENOMEM); goto out; } - memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; @@ -607,10 +611,13 @@ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = rcu_dereference(priv->match); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net), get_jiffies_64()); + e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, + nft_genmask_cur(net), get_jiffies_64(), + GFP_ATOMIC); if (IS_ERR(e)) return ERR_CAST(e); @@ -618,6 +625,65 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set, } /** + * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize + * @f: Field containing mapping table + * @old_rules: Amount of existing mapped rules + * @rules: Amount of new rules to map + * + * Return: 0 on success, negative error code on failure. + */ +static int pipapo_realloc_mt(struct nft_pipapo_field *f, + unsigned int old_rules, unsigned int rules) +{ + union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; + const unsigned int extra = PAGE_SIZE / sizeof(*new_mt); + unsigned int rules_alloc = rules; + + might_sleep(); + + if (unlikely(rules == 0)) + goto out_free; + + /* growing and enough space left, no action needed */ + if (rules > old_rules && f->rules_alloc > rules) + return 0; + + /* downsize and extra slack has not grown too large */ + if (rules < old_rules) { + unsigned int remove = f->rules_alloc - rules; + + if (remove < (2u * extra)) + return 0; + } + + /* If set needs more than one page of memory for rules then + * allocate another extra page to avoid frequent reallocation. + */ + if (rules > extra && + check_add_overflow(rules, extra, &rules_alloc)) + return -EOVERFLOW; + + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL); + if (!new_mt) + return -ENOMEM; + + if (old_mt) + memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); + + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } +out_free: + f->rules_alloc = rules_alloc; + f->mt = new_mt; + + kvfree(old_mt); + + return 0; +} + +/** * pipapo_resize() - Resize lookup or mapping table, or both * @f: Field containing lookup and mapping tables * @old_rules: Previous amount of rules in field @@ -629,12 +695,15 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set, * * Return: 0 on success, -ENOMEM on allocation failure. */ -static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +static int pipapo_resize(struct nft_pipapo_field *f, + unsigned int old_rules, unsigned int rules) { long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; - union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; - size_t new_bucket_size, copy; - int group, bucket; + unsigned int new_bucket_size, copy; + int group, bucket, err; + + if (rules >= NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); #ifdef NFT_PIPAPO_ALIGN @@ -674,27 +743,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) } mt: - new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); - if (!new_mt) { + err = pipapo_realloc_mt(f, old_rules, rules); + if (err) { kvfree(new_lt); - return -ENOMEM; - } - - memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); - if (rules > old_rules) { - memset(new_mt + old_rules, 0, - (rules - old_rules) * sizeof(*new_mt)); + return err; } if (new_lt) { f->bsize = new_bucket_size; - NFT_PIPAPO_LT_ASSIGN(f, new_lt); + f->lt = new_lt; kvfree(old_lt); } - f->mt = new_mt; - kvfree(old_mt); - return 0; } @@ -845,8 +905,8 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize, */ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { + unsigned int groups, bb; unsigned long *new_lt; - int groups, bb; size_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * @@ -896,7 +956,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) f->groups = groups; f->bb = bb; kvfree(f->lt); - NFT_PIPAPO_LT_ASSIGN(f, new_lt); + f->lt = new_lt; } /** @@ -913,7 +973,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules, group, ret, bit_offset = 0; + unsigned int rule = f->rules, group, ret, bit_offset = 0; ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) @@ -1188,6 +1248,40 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return 0; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); + +/** + * pipapo_maybe_clone() - Build clone for pending data changes, if not existing + * @set: nftables API set representation + * + * Return: newly created or existing clone, if any. NULL on allocation failure + */ +static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + + if (priv->clone) + return priv->clone; + + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = pipapo_clone(m); + + return priv->clone; +} + /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace @@ -1204,8 +1298,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); struct nft_pipapo_elem *e, *dup; u64 tstamp = nft_net_tstamp(net); @@ -1213,12 +1306,15 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const u8 *start_p, *end_p; int i, bsize_max, err = 0; + if (!m) + return -ENOMEM; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; - dup = pipapo_get(net, set, start, genmask, tstamp); + dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL); if (!IS_ERR(dup)) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1240,7 +1336,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (PTR_ERR(dup) == -ENOENT) { /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp); + dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp, + GFP_KERNEL); } if (PTR_ERR(dup) != -ENOENT) { @@ -1253,8 +1350,14 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, /* Validate */ start_p = start; end_p = end; + + /* some helpers return -1, or 0 >= for valid rule pos, + * so we cannot support more than INT_MAX rules at this time. + */ + BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); + nft_pipapo_for_each_field(f, i, m) { - if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + if (f->rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; if (memcmp(start_p, end_p, @@ -1266,8 +1369,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, } /* Insert */ - priv->dirty = true; - bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { @@ -1318,7 +1419,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * - * Return: copy of matching data passed as 'old', error pointer on failure + * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { @@ -1328,7 +1429,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); if (!new) - return ERR_PTR(-ENOMEM); + return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; @@ -1360,18 +1461,25 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new_lt) goto out_lt; - NFT_PIPAPO_LT_ASSIGN(dst, new_lt); + dst->lt = new_lt; memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * src->groups * NFT_PIPAPO_BUCKETS(src->bb)); - dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); - if (!dst->mt) - goto out_mt; + if (src->rules > 0) { + dst->mt = kvmalloc_array(src->rules_alloc, + sizeof(*src->mt), GFP_KERNEL); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + } else { + dst->mt = NULL; + dst->rules_alloc = 0; + } - memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); src++; dst++; } @@ -1393,7 +1501,7 @@ out_scratch: free_percpu(new->scratch); kfree(new); - return ERR_PTR(-ENOMEM); + return NULL; } /** @@ -1425,10 +1533,10 @@ out_scratch: * * Return: Number of rules that originated from the same entry as @first. */ -static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) { struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ - int r; + unsigned int r; for (r = first; r < f->rules; r++) { if (r != first && e != f->mt[r].e) @@ -1481,8 +1589,9 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) * 0 1 2 * element pointers: 0x42 0x42 0x44 */ -static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, - int start, int n, int to_offset, bool is_last) +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, + unsigned int start, unsigned int n, + unsigned int to_offset, bool is_last) { int i; @@ -1588,8 +1697,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); + unsigned int rules_f0, first_rule = 0; u64 tstamp = nft_net_tstamp(net); - int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; struct nft_trans_gc *gc; @@ -1600,7 +1709,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const struct nft_pipapo_field *f; - int i, start, rules_fx; + unsigned int i, start, rules_fx; start = first_rule; rules_fx = rules_f0; @@ -1624,8 +1733,6 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { - priv->dirty = true; - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; @@ -1703,57 +1810,30 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *old; - - if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + struct nft_pipapo_match *old; - if (!priv->dirty) + if (!priv->clone) return; - new_clone = pipapo_clone(priv->clone); - if (IS_ERR(new_clone)) - return; + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); - priv->dirty = false; + old = rcu_replace_pointer(priv->match, priv->clone, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = NULL; - old = rcu_access_pointer(priv->match); - rcu_assign_pointer(priv->match, priv->clone); if (old) call_rcu(&old->rcu, pipapo_reclaim_match); - - priv->clone = new_clone; -} - -static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) -{ -#ifdef CONFIG_PROVE_LOCKING - const struct net *net = read_pnet(&set->net); - - return lockdep_is_held(&nft_pernet(net)->commit_mutex); -#else - return true; -#endif } static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *m; - if (!priv->dirty) + if (!priv->clone) return; - - m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); - - new_clone = pipapo_clone(m); - if (IS_ERR(new_clone)) - return; - - priv->dirty = false; - pipapo_free_match(priv->clone); - priv->clone = new_clone; + priv->clone = NULL; } /** @@ -1777,51 +1857,38 @@ static void nft_pipapo_activate(const struct net *net, } /** - * pipapo_deactivate() - Check that element is in set, mark as inactive + * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network namespace * @set: nftables API set representation - * @data: Input key data - * @ext: nftables API extension pointer, used to check for end element - * - * This is a convenience function that can be called from both - * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same - * operation. + * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ -static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, - const u8 *data, const struct nft_set_ext *ext) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net), nft_net_tstamp(net)); + /* removal must occur on priv->clone, if we are low on memory + * we have no choice and must fail the removal request. + */ + if (!m) + return NULL; + + e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, + nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL); if (IS_ERR(e)) return NULL; nft_set_elem_change_active(net, set, &e->ext); - return e; -} - -/** - * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * - * Return: deactivated element if found, NULL otherwise. - */ -static struct nft_elem_priv * -nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - - return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); + return &e->priv; } /** - * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data @@ -1978,7 +2045,7 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; - int rules_f0, first_rule = 0; + unsigned int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; const u8 *data; @@ -2018,7 +2085,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); if (last && f->mt[rulemap[i].to].e == e) { - priv->dirty = true; pipapo_drop(m, rulemap); return; } @@ -2031,34 +2097,22 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, } /** - * nft_pipapo_walk() - Walk over elements + * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation + * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first - * field. + * field. @m is protected either by RCU read lock or by transaction mutex. */ -static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_pipapo_match *m, + struct nft_set_iter *iter) { - struct nft_pipapo *priv = nft_set_priv(set); - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; - int i, r; - - WARN_ON_ONCE(iter->type != NFT_ITER_READ && - iter->type != NFT_ITER_UPDATE); - - rcu_read_lock(); - if (iter->type == NFT_ITER_READ) - m = rcu_dereference(priv->match); - else - m = priv->clone; - - if (unlikely(!m)) - goto out; + unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; @@ -2076,14 +2130,49 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) - goto out; + return; cont: iter->count++; } +} -out: - rcu_read_unlock(); +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * Test if destructive action is needed or not, clone active backend if needed + * and call the real function to work on the data. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + + switch (iter->type) { + case NFT_ITER_UPDATE: + m = pipapo_maybe_clone(set); + if (!m) { + iter->err = -ENOMEM; + return; + } + + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_READ: + rcu_read_lock(); + m = rcu_dereference(priv->match); + nft_pipapo_do_walk(ctx, set, m, iter); + rcu_read_unlock(); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } /** @@ -2150,6 +2239,9 @@ static int nft_pipapo_init(const struct nft_set *set, field_count = desc->field_count ? : 1; + BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); + BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); + if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; @@ -2171,7 +2263,11 @@ static int nft_pipapo_init(const struct nft_set *set, rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { - int len = desc->field_len[i] ? : set->klen; + unsigned int len = desc->field_len[i] ? : set->klen; + + /* f->groups is u8 */ + BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * + BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); f->bb = NFT_PIPAPO_GROUP_BITS_INIT; f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); @@ -2180,25 +2276,15 @@ static int nft_pipapo_init(const struct nft_set *set, f->bsize = 0; f->rules = 0; - NFT_PIPAPO_LT_ASSIGN(f, NULL); + f->rules_alloc = 0; + f->lt = NULL; f->mt = NULL; } - /* Create an initial clone of matching data for next insertion */ - priv->clone = pipapo_clone(m); - if (IS_ERR(priv->clone)) { - err = PTR_ERR(priv->clone); - goto out_free; - } - - priv->dirty = false; - rcu_assign_pointer(priv->match, m); return 0; -out_free: - free_percpu(m->scratch); out_scratch: kfree(m); @@ -2216,7 +2302,7 @@ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; - int i, r; + unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; @@ -2243,33 +2329,18 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; - int cpu; m = rcu_dereference_protected(priv->match, true); - if (m) { - rcu_barrier(); - - for_each_possible_cpu(cpu) - pipapo_free_scratch(m, cpu); - free_percpu(m->scratch); - pipapo_free_fields(m); - kfree(m); - priv->match = NULL; - } if (priv->clone) { - m = priv->clone; - - nft_set_pipapo_match_destroy(ctx, set, m); - - for_each_possible_cpu(cpu) - pipapo_free_scratch(priv->clone, cpu); - free_percpu(priv->clone->scratch); - - pipapo_free_fields(priv->clone); - kfree(priv->clone); + nft_set_pipapo_match_destroy(ctx, set, priv->clone); + pipapo_free_match(priv->clone); priv->clone = NULL; + } else { + nft_set_pipapo_match_destroy(ctx, set, m); } + + pipapo_free_match(m); } /** diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 42464e7c24..4a2ff85ce1 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -70,15 +70,9 @@ #define NFT_PIPAPO_ALIGN_HEADROOM \ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) #define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) -#define NFT_PIPAPO_LT_ASSIGN(field, x) \ - do { \ - (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \ - (field)->lt = (x); \ - } while (0) #else #define NFT_PIPAPO_ALIGN_HEADROOM 0 #define NFT_PIPAPO_LT_ALIGN(lt) (lt) -#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x)) #endif /* NFT_PIPAPO_ALIGN */ #define nft_pipapo_for_each_field(field, index, match) \ @@ -110,22 +104,20 @@ union nft_pipapo_map_bucket { /** * struct nft_pipapo_field - Lookup, mapping tables and related data for a field - * @groups: Amount of bit groups * @rules: Number of inserted rules * @bsize: Size of each bucket in lookup table, in longs + * @rules_alloc: Number of allocated rules, always >= rules + * @groups: Amount of bit groups * @bb: Number of bits grouped together in lookup table buckets * @lt: Lookup table: 'groups' rows of buckets - * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes * @mt: Mapping table: one bucket per rule */ struct nft_pipapo_field { - int groups; - unsigned long rules; - size_t bsize; - int bb; -#ifdef NFT_PIPAPO_ALIGN - unsigned long *lt_aligned; -#endif + unsigned int rules; + unsigned int bsize; + unsigned int rules_alloc; + u8 groups; + u8 bb; unsigned long *lt; union nft_pipapo_map_bucket *mt; }; @@ -145,15 +137,15 @@ struct nft_pipapo_scratch { /** * struct nft_pipapo_match - Data used for lookup and matching * @field_count: Amount of fields in set - * @scratch: Preallocated per-CPU maps for partial matching results * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @scratch: Preallocated per-CPU maps for partial matching results * @rcu: Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { - int field_count; + u8 field_count; + unsigned int bsize_max; struct nft_pipapo_scratch * __percpu *scratch; - size_t bsize_max; struct rcu_head rcu; struct nft_pipapo_field f[] __counted_by(field_count); }; @@ -163,14 +155,12 @@ struct nft_pipapo_match { * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions * @last_gc: Timestamp of last garbage collection run, jiffies */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; - bool dirty; unsigned long last_gc; }; @@ -186,7 +176,8 @@ struct nft_pipapo_elem { struct nft_set_ext ext; }; -int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, +int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, + unsigned long *dst, const union nft_pipapo_map_bucket *mt, bool match_only); /** @@ -287,4 +278,25 @@ static u64 pipapo_estimate_size(const struct nft_set_desc *desc) return size; } +/** + * pipapo_resmap_init() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Initialize all bits covered by the first field to one, so that after + * the first step, only the matching bits of the first bit group remain. + * + * If other fields have a large bitmap, set remainder of res_map to 0. + */ +static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + for (i = 0; i < f->bsize; i++) + res_map[i] = ULONG_MAX; + + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} #endif /* _NFT_SET_PIPAPO_H */ diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index d08407d589..b8d3c3213e 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1036,6 +1036,7 @@ nothing: /** * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes + * @mdata: Matching data, including mapping table * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables @@ -1051,7 +1052,8 @@ nothing: * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ -static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, +static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata, + unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) @@ -1060,7 +1062,7 @@ static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, int i, ret = -1, b; if (first) - memset(map, 0xff, bsize * sizeof(*map)); + pipapo_resmap_init(mdata, map); for (i = offset; i < bsize; i++) { if (f->bb == 8) @@ -1137,8 +1139,14 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, bool map_index; int i, ret = 0; - if (unlikely(!irq_fpu_usable())) - return nft_pipapo_lookup(net, set, key, ext); + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + bool fallback_res = nft_pipapo_lookup(net, set, key, ext); + + local_bh_enable(); + return fallback_res; + } m = rcu_dereference(priv->match); @@ -1153,6 +1161,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) { kernel_fpu_end(); + local_bh_enable(); return false; } @@ -1186,7 +1195,7 @@ next_match: } else if (f->groups == 16) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } @@ -1202,7 +1211,7 @@ next_match: } else if (f->groups == 32) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { - ret = nft_pipapo_avx2_lookup_slow(res, fill, f, + ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } @@ -1233,6 +1242,7 @@ out: if (i % 2) scratch->map_index = !map_index; kernel_fpu_end(); + local_bh_enable(); return ret >= 0; } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index f735d79d8b..60a76e6e34 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -174,8 +174,8 @@ struct nft_tunnel_opts { struct erspan_metadata erspan; u8 data[IP_TUNNEL_OPTS_MAX]; } u; + IP_TUNNEL_DECLARE_FLAGS(flags); u32 len; - __be16 flags; }; struct nft_tunnel_obj { @@ -271,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); opts->len = sizeof(struct vxlan_metadata); - opts->flags = TUNNEL_VXLAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags); return 0; } @@ -325,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, opts->u.erspan.version = version; opts->len = sizeof(struct erspan_metadata); - opts->flags = TUNNEL_ERSPAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags); return 0; } @@ -366,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, opt->length = data_len / 4; opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); - opts->flags = TUNNEL_GENEVE_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags); return 0; } @@ -385,8 +388,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct nft_tunnel_opts *opts) { struct nlattr *nla; - __be16 type = 0; int err, rem; + u32 type = 0; err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, nft_tunnel_opts_policy, NULL); @@ -401,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_vxlan_init(nla, opts); if (err) return err; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_ERSPAN: if (type) @@ -409,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_erspan_init(nla, opts); if (err) return err; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; err = nft_tunnel_obj_geneve_init(nla, opts); if (err) return err; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; default: return -EOPNOTSUPP; @@ -454,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, memset(&info, 0, sizeof(info)); info.mode = IP_TUNNEL_INFO_TX; info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); - info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags); if (tb[NFTA_TUNNEL_KEY_IP]) { err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); @@ -483,11 +488,12 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) - info.key.tun_flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) - info.key.tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) - info.key.tun_flags |= TUNNEL_SEQ; + __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags); } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); @@ -583,7 +589,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (!nest) return -1; - if (opts->flags & TUNNEL_VXLAN_OPT) { + if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) goto failure; @@ -591,7 +597,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, htonl(opts->u.vxlan.gbp))) goto inner_failure; nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) goto failure; @@ -613,7 +619,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, break; } nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_GENEVE_OPT) { + } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) { struct geneve_opt *opt; int offset = 0; @@ -658,11 +664,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb, { u32 flags = 0; - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_DONT_FRAGMENT; - if (!(info->key.tun_flags & TUNNEL_CSUM)) + if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; - if (info->key.tun_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_SEQ_NUMBER; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index acef4155f0..008419db81 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -179,43 +179,6 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, } EXPORT_SYMBOL_GPL(nf_route); -static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) -{ -#ifdef CONFIG_INET - const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - if (!(iph->tos == rt_info->tos && - skb->mark == rt_info->mark && - iph->daddr == rt_info->daddr && - iph->saddr == rt_info->saddr)) - return ip_route_me_harder(entry->state.net, entry->state.sk, - skb, RTN_UNSPEC); - } -#endif - return 0; -} - -int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) -{ - const struct nf_ipv6_ops *v6ops; - int ret = 0; - - switch (entry->state.pf) { - case AF_INET: - ret = nf_ip_reroute(skb, entry); - break; - case AF_INET6: - v6ops = rcu_dereference(nf_ipv6_ops); - if (v6ops) - ret = v6ops->reroute(skb, entry); - break; - } - return ret; -} - /* Only get and check the lengths, not do any hop-by-hop stuff. */ int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen) { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 21624d6831..da5d929c7c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1142,7 +1142,8 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, if (target->compat_from_user) target->compat_from_user(t->data, ct->data); else - memcpy(t->data, ct->data, tsize - sizeof(*ct)); + unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct), + /* UAPI 0-sized destination */); tsize += off; t->u.user.target_size = tsize; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 27511c90a2..cd9160bbc9 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -610,7 +610,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) struct netlbl_lsm_catmap *iter; u32 idx; u32 bit; - NETLBL_CATMAP_MAPTYPE bitmap; + u64 bitmap; iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0); if (iter == NULL) @@ -666,8 +666,8 @@ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) struct netlbl_lsm_catmap *prev = NULL; u32 idx; u32 bit; - NETLBL_CATMAP_MAPTYPE bitmask; - NETLBL_CATMAP_MAPTYPE bitmap; + u64 bitmask; + u64 bitmap; iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0); if (iter == NULL) @@ -857,7 +857,7 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, offset -= iter->startbit; idx = offset / NETLBL_CATMAP_MAPSIZE; - iter->bitmap[idx] |= (NETLBL_CATMAP_MAPTYPE)bitmap + iter->bitmap[idx] |= (u64)bitmap << (offset % NETLBL_CATMAP_MAPSIZE); return 0; @@ -876,7 +876,7 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, * Description: * Starting at @offset, walk the bitmap from left to right until either the * desired bit is found or we reach the end. Return the bit offset, -1 if - * not found, or -2 if error. + * not found. */ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, u32 offset, u8 state) @@ -965,6 +965,7 @@ int netlbl_enabled(void) * @sk: the socket to label * @family: protocol family * @secattr: the security attributes + * @sk_locked: true if caller holds the socket lock * * Description: * Attach the correct label to the given socket using the security attributes @@ -977,7 +978,8 @@ int netlbl_enabled(void) */ int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { int ret_val; struct netlbl_dom_map *dom_entry; @@ -997,7 +999,7 @@ int netlbl_sock_setattr(struct sock *sk, case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_sock_setattr(sk, dom_entry->def.cipso, - secattr); + secattr, sk_locked); break; case NETLBL_NLTYPE_UNLABELED: ret_val = 0; @@ -1091,6 +1093,28 @@ int netlbl_sock_getattr(struct sock *sk, } /** + * netlbl_sk_lock_check - Check if the socket lock has been acquired. + * @sk: the socket to be checked + * + * Return: true if socket @sk is locked or if lock debugging is disabled at + * runtime or compile-time; false otherwise + * + */ +#ifdef CONFIG_LOCKDEP +bool netlbl_sk_lock_check(struct sock *sk) +{ + if (debug_locks) + return lockdep_sock_is_held(sk); + return true; +} +#else +bool netlbl_sk_lock_check(struct sock *sk) +{ + return true; +} +#endif + +/** * netlbl_conn_setattr - Label a connected socket using the correct protocol * @sk: the socket to label * @addr: the destination address @@ -1126,7 +1150,8 @@ int netlbl_conn_setattr(struct sock *sk, switch (entry->type) { case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_sock_setattr(sk, - entry->cipso, secattr); + entry->cipso, secattr, + netlbl_sk_lock_check(sk)); break; case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ff31535126..fa9c090cf6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -59,7 +59,6 @@ #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> -#include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> @@ -73,6 +72,7 @@ #include <trace/events/netlink.h> #include "af_netlink.h" +#include "genetlink.h" struct listeners { struct rcu_head rcu; @@ -130,7 +130,7 @@ static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-MAX_LINKS" }; -static int netlink_dump(struct sock *sk); +static int netlink_dump(struct sock *sk, bool lock_taken); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion @@ -636,7 +636,7 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol, + struct mutex *dump_cb_mutex, int protocol, int kern) { struct sock *sk; @@ -651,15 +651,11 @@ static int __netlink_create(struct net *net, struct socket *sock, sock_init_data(sock, sk); nlk = nlk_sk(sk); - if (cb_mutex) { - nlk->cb_mutex = cb_mutex; - } else { - nlk->cb_mutex = &nlk->cb_def_mutex; - mutex_init(nlk->cb_mutex); - lockdep_set_class_and_name(nlk->cb_mutex, + mutex_init(&nlk->nl_cb_mutex); + lockdep_set_class_and_name(&nlk->nl_cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); - } + nlk->dump_cb_mutex = dump_cb_mutex; init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; @@ -1206,23 +1202,21 @@ struct sock *netlink_getsockbyfilp(struct file *filp) struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { + size_t head_size = SKB_HEAD_ALIGN(size); struct sk_buff *skb; void *data; - if (size <= NLMSG_GOODSIZE || broadcast) + if (head_size <= PAGE_SIZE || broadcast) return alloc_skb(size, GFP_KERNEL); - size = SKB_DATA_ALIGN(size) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - - data = vmalloc(size); - if (data == NULL) + data = kvmalloc(head_size, GFP_KERNEL); + if (!data) return NULL; - skb = __build_skb(data, size); - if (skb == NULL) - vfree(data); - else + skb = __build_skb(data, head_size); + if (!skb) + kvfree(data); + else if (is_vmalloc_addr(data)) skb->destructor = netlink_skb_destructor; return skb; @@ -1779,6 +1773,9 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, netlink_unlock_table(); return err; } + case NETLINK_LISTEN_ALL_NSID: + flag = NETLINK_F_LISTEN_ALL_NSID; + break; case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; @@ -1987,7 +1984,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { - ret = netlink_dump(sk); + ret = netlink_dump(sk, false); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); @@ -2168,6 +2165,69 @@ __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int fla } EXPORT_SYMBOL(__nlmsg_put); +static size_t +netlink_ack_tlv_len(struct netlink_sock *nlk, int err, + const struct netlink_ext_ack *extack) +{ + size_t tlvlen; + + if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) + return 0; + + tlvlen = 0; + if (extack->_msg) + tlvlen += nla_total_size(strlen(extack->_msg) + 1); + if (extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); + + /* Following attributes are only reported as error (not warning) */ + if (!err) + return tlvlen; + + if (extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + if (extack->policy) + tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); + if (extack->miss_type) + tlvlen += nla_total_size(sizeof(u32)); + if (extack->miss_nest) + tlvlen += nla_total_size(sizeof(u32)); + + return tlvlen; +} + +static void +netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, + const struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack) +{ + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, extack->cookie)); + + if (!err) + return; + + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - (const u8 *)nlh)); + if (extack->policy) + netlink_policy_dump_write_attr(skb, extack->policy, + NLMSGERR_ATTR_POLICY); + if (extack->miss_type) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, + extack->miss_type)); + if (extack->miss_nest && + !WARN_ON((u8 *)extack->miss_nest < in_skb->data || + (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, + (u8 *)extack->miss_nest - (const u8 *)nlh)); +} + /* * It looks a bit ugly. * It would be better to create kernel thread. @@ -2178,6 +2238,7 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; + size_t extack_len; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); @@ -2187,16 +2248,20 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); - if (extack->_msg && test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) { + extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack); + if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; - if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) + if (skb_tailroom(skb) >= extack_len) { + netlink_ack_tlv_fill(cb->skb, skb, cb->nlh, + nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); + } } return 0; } -static int netlink_dump(struct sock *sk) +static int netlink_dump(struct sock *sk, bool lock_taken) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; @@ -2208,7 +2273,8 @@ static int netlink_dump(struct sock *sk) int alloc_min_size; int alloc_size; - mutex_lock(nlk->cb_mutex); + if (!lock_taken) + mutex_lock(&nlk->nl_cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; @@ -2260,14 +2326,33 @@ static int netlink_dump(struct sock *sk) netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { + struct mutex *extra_mutex = nlk->dump_cb_mutex; + cb->extack = &extack; + + if (cb->flags & RTNL_FLAG_DUMP_UNLOCKED) + extra_mutex = NULL; + if (extra_mutex) + mutex_lock(extra_mutex); nlk->dump_done_errno = cb->dump(skb, cb); + if (extra_mutex) + mutex_unlock(extra_mutex); + + /* EMSGSIZE plus something already in the skb means + * that there's more to dump but current skb has filled up. + * If the callback really wants to return EMSGSIZE to user space + * it needs to do so again, on the next cb->dump() call, + * without putting data in the skb. + */ + if (nlk->dump_done_errno == -EMSGSIZE && skb->len) + nlk->dump_done_errno = skb->len; + cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { - mutex_unlock(nlk->cb_mutex); + mutex_unlock(&nlk->nl_cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); @@ -2301,13 +2386,13 @@ static int netlink_dump(struct sock *sk) WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; - mutex_unlock(nlk->cb_mutex); + mutex_unlock(&nlk->nl_cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: - mutex_unlock(nlk->cb_mutex); + mutex_unlock(&nlk->nl_cb_mutex); kfree_skb(skb); return err; } @@ -2330,7 +2415,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, } nlk = nlk_sk(sk); - mutex_lock(nlk->cb_mutex); + mutex_lock(&nlk->nl_cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; @@ -2350,6 +2435,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; + cb->flags = control->flags; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); @@ -2365,9 +2451,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; - mutex_unlock(nlk->cb_mutex); - - ret = netlink_dump(sk); + ret = netlink_dump(sk, true); sock_put(sk); @@ -2383,76 +2467,13 @@ error_put: module_put(control->module); error_unlock: sock_put(sk); - mutex_unlock(nlk->cb_mutex); + mutex_unlock(&nlk->nl_cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); -static size_t -netlink_ack_tlv_len(struct netlink_sock *nlk, int err, - const struct netlink_ext_ack *extack) -{ - size_t tlvlen; - - if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) - return 0; - - tlvlen = 0; - if (extack->_msg) - tlvlen += nla_total_size(strlen(extack->_msg) + 1); - if (extack->cookie_len) - tlvlen += nla_total_size(extack->cookie_len); - - /* Following attributes are only reported as error (not warning) */ - if (!err) - return tlvlen; - - if (extack->bad_attr) - tlvlen += nla_total_size(sizeof(u32)); - if (extack->policy) - tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); - if (extack->miss_type) - tlvlen += nla_total_size(sizeof(u32)); - if (extack->miss_nest) - tlvlen += nla_total_size(sizeof(u32)); - - return tlvlen; -} - -static void -netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, - struct nlmsghdr *nlh, int err, - const struct netlink_ext_ack *extack) -{ - if (extack->_msg) - WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); - if (extack->cookie_len) - WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, - extack->cookie_len, extack->cookie)); - - if (!err) - return; - - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - (u8 *)nlh)); - if (extack->policy) - netlink_policy_dump_write_attr(skb, extack->policy, - NLMSGERR_ATTR_POLICY); - if (extack->miss_type) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, - extack->miss_type)); - if (extack->miss_nest && - !WARN_ON((u8 *)extack->miss_nest < in_skb->data || - (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, - (u8 *)extack->miss_nest - (u8 *)nlh)); -} - void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 2145979b99..9751e29d4b 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -39,8 +39,9 @@ struct netlink_sock { bool cb_running; int dump_done_errno; struct netlink_callback cb; - struct mutex *cb_mutex; - struct mutex cb_def_mutex; + struct mutex nl_cb_mutex; + + struct mutex *dump_cb_mutex; void (*netlink_rcv)(struct sk_buff *skb); int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 1eeff94228..61981e01fd 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -207,7 +207,7 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } - return err < 0 ? err : skb->len; + return err <= 0 ? err : skb->len; } static int netlink_diag_dump_done(struct netlink_callback *cb) @@ -241,6 +241,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler netlink_diag_handler = { + .owner = THIS_MODULE, .family = AF_NETLINK, .dump = netlink_diag_handler_dump, }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 8c7af02f84..feb54c63a1 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -22,6 +22,8 @@ #include <net/sock.h> #include <net/genetlink.h> +#include "genetlink.h" + static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); @@ -1232,7 +1234,7 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) - return -1; + return -EMSGSIZE; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) || @@ -1355,6 +1357,7 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); int fams_to_skip = cb->args[0]; unsigned int id; + int err = 0; idr_for_each_entry(&genl_fam_idr, rt, id) { if (!rt->netnsok && !net_eq(net, &init_net)) @@ -1363,16 +1366,17 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) if (n++ < fams_to_skip) continue; - if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - skb, CTRL_CMD_NEWFAMILY) < 0) { + err = ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, CTRL_CMD_NEWFAMILY); + if (err) { n--; break; } } cb->args[0] = n; - return skb->len; + return err; } static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family, @@ -1836,6 +1840,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (family->bind) + family->bind(i); + break; } @@ -1843,12 +1850,39 @@ static int genl_bind(struct net *net, int group) return ret; } +static void genl_unbind(struct net *net, int group) +{ + const struct genl_family *family; + unsigned int id; + + down_read(&cb_lock); + + idr_for_each_entry(&genl_fam_idr, family, id) { + int i; + + if (family->n_mcgrps == 0) + continue; + + i = group - family->mcgrp_offset; + if (i < 0 || i >= family->n_mcgrps) + continue; + + if (family->unbind) + family->unbind(i); + + break; + } + + up_read(&cb_lock); +} + static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, + .unbind = genl_unbind, .release = genl_release, }; diff --git a/net/netlink/genetlink.h b/net/netlink/genetlink.h new file mode 100644 index 0000000000..89bd9d2631 --- /dev/null +++ b/net/netlink/genetlink.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_GENETLINK_H +#define __NET_GENETLINK_H + +#include <linux/wait.h> + +/* for synchronisation between af_netlink and genetlink */ +extern atomic_t genl_sk_destructing_cnt; +extern wait_queue_head_t genl_sk_destructing_waitq; + +#endif /* __LINUX_GENERIC_NETLINK_H */ diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 104a80b754..6ee148f0e6 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -772,8 +772,8 @@ out_release: return err; } -static int nr_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int nr_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; @@ -805,7 +805,7 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags, if (skb) break; - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 70480869ad..bd2b17b219 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -285,22 +285,14 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, return 0; } -static inline void __nr_remove_node(struct nr_node *nr_node) +static void nr_remove_node_locked(struct nr_node *nr_node) { + lockdep_assert_held(&nr_node_list_lock); + hlist_del_init(&nr_node->node_node); nr_node_put(nr_node); } -#define nr_remove_node_locked(__node) \ - __nr_remove_node(__node) - -static void nr_remove_node(struct nr_node *nr_node) -{ - spin_lock_bh(&nr_node_list_lock); - __nr_remove_node(nr_node); - spin_unlock_bh(&nr_node_list_lock); -} - static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh) { hlist_del_init(&nr_neigh->neigh_node); @@ -339,6 +331,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n return -EINVAL; } + spin_lock_bh(&nr_node_list_lock); nr_node_lock(nr_node); for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { @@ -352,7 +345,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n nr_node->count--; if (nr_node->count == 0) { - nr_remove_node(nr_node); + nr_remove_node_locked(nr_node); } else { switch (i) { case 0: @@ -367,12 +360,14 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct n nr_node_put(nr_node); } nr_node_unlock(nr_node); + spin_unlock_bh(&nr_node_list_lock); return 0; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); + spin_unlock_bh(&nr_node_list_lock); nr_node_put(nr_node); return -EINVAL; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 4e7c968cde..5e3ca068f0 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -121,7 +121,8 @@ static void nr_heartbeat_expiry(struct timer_list *t) is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { - sock_hold(sk); + if (sk->sk_state == TCP_LISTEN) + sock_hold(sk); bh_unlock_sock(sk); nr_destroy_socket(sk); goto out; diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index 79fb2d3f47..7dc0fa628f 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -140,7 +140,6 @@ static struct ctl_table nr_table[] = { .extra1 = &min_reset, .extra2 = &max_reset }, - { } }; int __init nr_register_sysctl(void) diff --git a/net/nfc/core.c b/net/nfc/core.c index eb2c0959e5..e58dc64050 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1015,7 +1015,7 @@ static void nfc_check_pres_timeout(struct timer_list *t) schedule_work(&dev->check_pres_work); } -struct class nfc_class = { +const struct class nfc_class = { .name = "nfc", .dev_release = nfc_release, }; diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 2140f67246..ba91284f40 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -30,15 +30,19 @@ exit: return r; } +static void nfc_llc_del_engine(struct nfc_llc_engine *llc_engine) +{ + list_del(&llc_engine->entry); + kfree_const(llc_engine->name); + kfree(llc_engine); +} + void nfc_llc_exit(void) { struct nfc_llc_engine *llc_engine, *n; - list_for_each_entry_safe(llc_engine, n, &llc_engines, entry) { - list_del(&llc_engine->entry); - kfree(llc_engine->name); - kfree(llc_engine); - } + list_for_each_entry_safe(llc_engine, n, &llc_engines, entry) + nfc_llc_del_engine(llc_engine); } int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) @@ -49,7 +53,7 @@ int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) if (llc_engine == NULL) return -ENOMEM; - llc_engine->name = kstrdup(name, GFP_KERNEL); + llc_engine->name = kstrdup_const(name, GFP_KERNEL); if (llc_engine->name == NULL) { kfree(llc_engine); return -ENOMEM; @@ -82,9 +86,7 @@ void nfc_llc_unregister(const char *name) if (llc_engine == NULL) return; - list_del(&llc_engine->entry); - kfree(llc_engine->name); - kfree(llc_engine); + nfc_llc_del_engine(llc_engine); } struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev, diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 819157bbb5..57a2f97004 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -252,10 +252,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_RW) { err = -EINVAL; @@ -274,10 +274,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_MIUX) { err = -EINVAL; @@ -447,7 +447,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent, } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; @@ -463,7 +463,7 @@ static int llcp_sock_accept(struct socket *sock, struct socket *newsock, goto error; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection. */ add_wait_queue_exclusive(sk_sleep(sk), &wait); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 0d26c8ec99..f456a5911e 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1463,6 +1463,19 @@ int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, ndev->ops->n_core_ops); } +static bool nci_valid_size(struct sk_buff *skb) +{ + BUILD_BUG_ON(NCI_CTRL_HDR_SIZE != NCI_DATA_HDR_SIZE); + unsigned int hdr_size = NCI_CTRL_HDR_SIZE; + + if (skb->len < hdr_size || + !nci_plen(skb->data) || + skb->len < hdr_size + nci_plen(skb->data)) { + return false; + } + return true; +} + /* ---- NCI TX Data worker thread ---- */ static void nci_tx_work(struct work_struct *work) @@ -1516,9 +1529,9 @@ static void nci_rx_work(struct work_struct *work) nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); - if (!nci_plen(skb->data)) { + if (!nci_valid_size(skb)) { kfree_skb(skb); - break; + continue; } /* Process frame */ diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index aa1dbf654c..dd2ce73a24 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -969,8 +969,7 @@ static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) int rc; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || - !info->attrs[NFC_ATTR_TARGET_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); @@ -1018,8 +1017,7 @@ static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg = NULL; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || - !info->attrs[NFC_ATTR_FIRMWARE_NAME]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index f4a38bd6a7..bfb7758063 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -77,13 +77,15 @@ EXPORT_SYMBOL_GPL(nsh_pop); static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { + unsigned int outer_hlen, mac_len, nsh_len; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; - unsigned int nsh_len, mac_len; - __be16 proto; + __be16 outer_proto, proto; skb_reset_network_header(skb); + outer_proto = skb->protocol; + outer_hlen = skb_mac_header_len(skb); mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) @@ -113,10 +115,10 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, } for (skb = segs; skb; skb = skb->next) { - skb->protocol = htons(ETH_P_NSH); - __skb_push(skb, nsh_len); - skb->mac_header = mac_offset; - skb->network_header = skb->mac_header + mac_len; + skb->protocol = outer_proto; + __skb_push(skb, nsh_len + outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, outer_hlen); skb->mac_len = mac_len; } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 6fcd7e2ca8..9642255808 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -936,6 +936,12 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, pskb_trim(skb, ovs_mac_header_len(key)); } + /* Need to set the pkt_type to involve the routing layer. The + * packet movement through the OVS datapath doesn't generally + * use routing, but this is needed for tunnel cases. + */ + skb->pkt_type = PACKET_OUTGOING; + if (likely(!mru || (skb->len <= mru + vport->dev->hard_header_len))) { ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 2928c142a2..3b980bf277 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -168,8 +168,13 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct) static void ovs_ct_get_labels(const struct nf_conn *ct, struct ovs_key_ct_labels *labels) { - struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; + struct nf_conn_labels *cl = NULL; + if (ct) { + if (ct->master && !nf_ct_is_confirmed(ct)) + ct = ct->master; + cl = nf_ct_labels_find(ct); + } if (cl) memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); else diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 11c69415c6..99d72543ab 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -15,7 +15,6 @@ #include <linux/delay.h> #include <linux/time.h> #include <linux/etherdevice.h> -#include <linux/genetlink.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mutex.h> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 33b21a0c05..8a848ce72e 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -561,7 +561,6 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, */ key->tp.src = htons(icmp->icmp6_type); key->tp.dst = htons(icmp->icmp6_code); - memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -570,6 +569,8 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, struct nd_msg *nd; int offset; + memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); + /* In order to process neighbor discovery options, we need the * entire packet. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ebc5728aab..f224d9bcea 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -152,6 +152,13 @@ static void update_range(struct sw_flow_match *match, sizeof((match)->key->field)); \ } while (0) +#define SW_FLOW_KEY_BITMAP_COPY(match, field, value_p, nbits, is_mask) ({ \ + update_range(match, offsetof(struct sw_flow_key, field), \ + bitmap_size(nbits), is_mask); \ + bitmap_copy(is_mask ? (match)->mask->key.field : (match)->key->field, \ + value_p, nbits); \ +}) + static bool match_validate(const struct sw_flow_match *match, u64 key_attrs, u64 mask_attrs, bool log) { @@ -670,8 +677,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, bool log) { bool ttl = false, ipv4 = false, ipv6 = false; + IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; bool info_bridge_mode = false; - __be16 tun_flags = 0; int opts_type = 0; struct nlattr *a; int rem; @@ -697,7 +704,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_ID: SW_FLOW_KEY_PUT(match, tun_key.tun_id, nla_get_be64(a), is_mask); - tun_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, @@ -729,10 +736,10 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, ttl = true; break; case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_TP_SRC: SW_FLOW_KEY_PUT(match, tun_key.tp_src, @@ -743,7 +750,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, nla_get_be16(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_OAM: - tun_flags |= TUNNEL_OAM; + __set_bit(IP_TUNNEL_OAM_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: if (opts_type) { @@ -755,7 +762,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, if (err) return err; - tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_flags); opts_type = type; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: @@ -768,7 +775,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, if (err) return err; - tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_flags); opts_type = type; break; case OVS_TUNNEL_KEY_ATTR_PAD: @@ -784,7 +791,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, if (err) return err; - tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_flags); opts_type = type; break; case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE: @@ -798,7 +805,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, } } - SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + SW_FLOW_KEY_BITMAP_COPY(match, tun_key.tun_flags, tun_flags, + __IP_TUNNEL_FLAG_NUM, is_mask); if (is_mask) SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); else @@ -823,13 +831,15 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, } if (ipv4) { if (info_bridge_mode) { + __clear_bit(IP_TUNNEL_KEY_BIT, tun_flags); + if (match->key->tun_key.u.ipv4.src || match->key->tun_key.u.ipv4.dst || match->key->tun_key.tp_src || match->key->tun_key.tp_dst || match->key->tun_key.ttl || match->key->tun_key.tos || - tun_flags & ~TUNNEL_KEY) { + !ip_tunnel_flags_empty(tun_flags)) { OVS_NLERR(log, "IPv4 tun info is not correct"); return -EINVAL; } @@ -874,7 +884,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len, unsigned short tun_proto, u8 mode) { - if (output->tun_flags & TUNNEL_KEY && + if (test_bit(IP_TUNNEL_KEY_BIT, output->tun_flags) && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, OVS_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; @@ -910,10 +920,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, return -EMSGSIZE; if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, output->tun_flags) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_CSUM) && + if (test_bit(IP_TUNNEL_CSUM_BIT, output->tun_flags) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (output->tp_src && @@ -922,18 +932,20 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, if (output->tp_dst && nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_OAM) && + if (test_bit(IP_TUNNEL_OAM_BIT, output->tun_flags) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (swkey_tun_opts_len) { - if (output->tun_flags & TUNNEL_GENEVE_OPT && + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, output->tun_flags) && nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_VXLAN_OPT && + else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, + output->tun_flags) && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + output->tun_flags) && nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; @@ -2029,7 +2041,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if ((swkey->tun_proto || is_mask)) { const void *opts = NULL; - if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + if (ip_tunnel_is_options_present(output->tun_key.tun_flags)) opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); if (ip_tun_to_nlattr(skb, &output->tun_key, opts, @@ -2752,7 +2764,8 @@ static int validate_geneve_opts(struct sw_flow_key *key) opts_len -= len; } - key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + if (crit_opt) + __set_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_key.tun_flags); return 0; } @@ -2760,6 +2773,7 @@ static int validate_geneve_opts(struct sw_flow_key *key) static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { + IP_TUNNEL_DECLARE_FLAGS(dst_opt_type) = { }; struct sw_flow_match match; struct sw_flow_key key; struct metadata_dst *tun_dst; @@ -2767,9 +2781,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; - __be16 dst_opt_type; - dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2781,13 +2793,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; - dst_opt_type = TUNNEL_GENEVE_OPT; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, dst_opt_type); break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: - dst_opt_type = TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, dst_opt_type); break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - dst_opt_type = TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, dst_opt_type); break; } } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index ed11cd12b5..8bbf983cd2 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -11,7 +11,6 @@ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/openvswitch.h> -#include <linux/genetlink.h> #include <linux/skbuff.h> #include <linux/bits.h> diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h index 3eb35d9eb7..74d75aaebe 100644 --- a/net/openvswitch/openvswitch_trace.h +++ b/net/openvswitch/openvswitch_trace.h @@ -43,8 +43,8 @@ TRACE_EVENT(ovs_do_execute_action, TP_fast_assign( __entry->dpaddr = dp; - __assign_str(dp_name, ovs_dp_name(dp)); - __assign_str(dev_name, skb->dev->name); + __assign_str(dp_name); + __assign_str(dev_name); __entry->skbaddr = skb; __entry->len = skb->len; __entry->data_len = skb->data_len; @@ -113,8 +113,8 @@ TRACE_EVENT(ovs_dp_upcall, TP_fast_assign( __entry->dpaddr = dp; - __assign_str(dp_name, ovs_dp_name(dp)); - __assign_str(dev_name, skb->dev->name); + __assign_str(dp_name); + __assign_str(dev_name); __entry->skbaddr = skb; __entry->len = skb->len; __entry->data_len = skb->data_len; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 903537a5da..91a11067e4 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -82,6 +82,13 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) err = -ENODEV; goto error_free_vport; } + /* Ensure that the device exists and that the provided + * name is not one of its aliases. + */ + if (strcmp(name, ovs_vport_name(vport))) { + err = -ENODEV; + goto error_put; + } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e6a8701a38..4692a9ef11 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,6 +538,61 @@ static void *packet_current_frame(struct packet_sock *po, return packet_lookup_frame(po, rb, rb->head, status); } +static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) +{ + u8 *skb_orig_data = skb->data; + int skb_orig_len = skb->len; + struct vlan_hdr vhdr, *vh; + unsigned int header_len; + + if (!dev) + return 0; + + /* In the SOCK_DGRAM scenario, skb data starts at the network + * protocol, which is after the VLAN headers. The outer VLAN + * header is at the hard_header_len offset in non-variable + * length link layer headers. If it's a VLAN device, the + * min_header_len should be used to exclude the VLAN header + * size. + */ + if (dev->min_header_len == dev->hard_header_len) + header_len = dev->hard_header_len; + else if (is_vlan_dev(dev)) + header_len = dev->min_header_len; + else + return 0; + + skb_push(skb, skb->data - skb_mac_header(skb)); + vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr); + if (skb_orig_data != skb->data) { + skb->data = skb_orig_data; + skb->len = skb_orig_len; + } + if (unlikely(!vh)) + return 0; + + return ntohs(vh->h_vlan_TCI); +} + +static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +{ + __be16 proto = skb->protocol; + + if (unlikely(eth_type_vlan(proto))) { + u8 *skb_orig_data = skb->data; + int skb_orig_len = skb->len; + + skb_push(skb, skb->data - skb_mac_header(skb)); + proto = __vlan_get_protocol(skb, proto, NULL); + if (skb_orig_data != skb->data) { + skb->data = skb_orig_data; + skb->len = skb_orig_len; + } + } + + return proto; +} + static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { del_timer_sync(&pkc->retire_blk_timer); @@ -1007,10 +1062,16 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { + struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc); + if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; + } else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) { + ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev); + ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol); + ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; @@ -2318,7 +2379,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { - if (po->copy_thresh && + if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); @@ -2428,6 +2489,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; + } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { + h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev); + h.h2->tp_vlan_tpid = ntohs(skb->protocol); + status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; @@ -2457,7 +2522,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; - sll->sll_protocol = skb->protocol; + sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ? + vlan_get_protocol_dgram(skb) : skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; @@ -2522,8 +2588,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); - if (!packet_read_pending(&po->tx_ring)) - complete(&po->skb_completion); + complete(&po->skb_completion); } sock_wfree(skb); @@ -3483,7 +3548,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; - sll->sll_protocol = skb->protocol; + sll->sll_protocol = (sock->type == SOCK_DGRAM) ? + vlan_get_protocol_dgram(skb) : skb->protocol; } sock_recv_cmsgs(msg, sk, skb); @@ -3540,6 +3606,21 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; + } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { + struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex); + if (dev) { + aux.tp_vlan_tci = vlan_get_tci(skb, dev); + aux.tp_vlan_tpid = ntohs(skb->protocol); + aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; + } else { + aux.tp_vlan_tci = 0; + aux.tp_vlan_tpid = 0; + } + rcu_read_unlock(); } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; @@ -3800,28 +3881,30 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, case PACKET_TX_RING: { union tpacket_req_u req_u; - int len; + ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: - len = sizeof(req_u.req); + if (optlen < sizeof(req_u.req)) + break; + ret = copy_from_sockptr(&req_u.req, optval, + sizeof(req_u.req)) ? + -EINVAL : 0; break; case TPACKET_V3: default: - len = sizeof(req_u.req3); + if (optlen < sizeof(req_u.req3)) + break; + ret = copy_from_sockptr(&req_u.req3, optval, + sizeof(req_u.req3)) ? + -EINVAL : 0; break; } - if (optlen < len) { - ret = -EINVAL; - } else { - if (copy_from_sockptr(&req_u.req, optval, len)) - ret = -EFAULT; - else - ret = packet_set_ring(sk, &req_u, 0, - optname == PACKET_TX_RING); - } + if (!ret) + ret = packet_set_ring(sk, &req_u, 0, + optname == PACKET_TX_RING); release_sock(sk); return ret; } @@ -3834,7 +3917,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - pkt_sk(sk)->copy_thresh = val; + WRITE_ONCE(pkt_sk(sk)->copy_thresh, val); return 0; } case PACKET_VERSION: @@ -4088,6 +4171,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; + case PACKET_COPY_THRESH: + val = READ_ONCE(pkt_sk(sk)->copy_thresh); + break; case PACKET_VERSION: val = po->tp_version; break; diff --git a/net/packet/diag.c b/net/packet/diag.c index 9a7980e330..47f69f3dbf 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -17,7 +17,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_index = po->ifindex; pinfo.pdi_version = po->tp_version; pinfo.pdi_reserve = po->tp_reserve; - pinfo.pdi_copy_thresh = po->copy_thresh; + pinfo.pdi_copy_thresh = READ_ONCE(po->copy_thresh); pinfo.pdi_tstamp = READ_ONCE(po->tp_tstamp); pinfo.pdi_flags = 0; @@ -245,6 +245,7 @@ static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler packet_diag_handler = { + .owner = THIS_MODULE, .family = AF_PACKET, .dump = packet_diag_handler_dump, }; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 3dd5f52bc1..53a858478e 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -759,8 +759,8 @@ static void pep_sock_close(struct sock *sk, long timeout) sock_put(sk); } -static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, - bool kern) +static struct sock *pep_sock_accept(struct sock *sk, + struct proto_accept_arg *arg) { struct pep_sock *pn = pep_sk(sk), *newpn; struct sock *newsk = NULL; @@ -772,8 +772,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, u8 pipe_handle, enabled, n_sb; u8 aligned = 0; - skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, - errp); + skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, + &arg->err); if (!skb) return NULL; @@ -836,7 +836,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, /* Create a new to-be-accepted sock */ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, - kern); + arg->kern); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; @@ -878,7 +878,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, drop: release_sock(sk); kfree_skb(skb); - *errp = err; + arg->err = err; return newsk; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 59aebe2968..7008d40249 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -178,7 +178,7 @@ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_u8(skb, RTA_DST, dst) || - nla_put_u32(skb, RTA_OIF, dev->ifindex)) + nla_put_u32(skb, RTA_OIF, READ_ONCE(dev->ifindex))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -193,7 +193,7 @@ void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(1) + nla_total_size(4), GFP_KERNEL); if (skb == NULL) goto errout; @@ -263,6 +263,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + int err = 0; u8 addr; rcu_read_lock(); @@ -272,16 +273,16 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) continue; - if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE) < 0) - goto out; + err = fill_route(skb, dev, addr << 2, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE); + if (err < 0) + break; } - -out: rcu_read_unlock(); cb->args[0] = addr; - return skb->len; + return err; } int __init phonet_netlink_register(void) @@ -301,6 +302,6 @@ int __init phonet_netlink_register(void) rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0); rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE, - NULL, route_dumpit, 0); + NULL, route_dumpit, RTNL_FLAG_DUMP_UNLOCKED); return 0; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 1018340d89..5ce0b3ee5d 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -292,18 +292,17 @@ out: } static int pn_socket_accept(struct socket *sock, struct socket *newsock, - int flags, bool kern) + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; - int err; if (unlikely(sk->sk_state != TCP_LISTEN)) return -EINVAL; - newsk = sk->sk_prot->accept(sk, flags, &err, kern); + newsk = sk->sk_prot->accept(sk, arg); if (!newsk) - return err; + return arg->err; lock_sock(newsk); sock_graft(newsk, newsock); diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 0d0bf41381..82fc22467a 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -81,7 +81,6 @@ static struct ctl_table phonet_table[] = { .mode = 0644, .proc_handler = proc_local_port_range, }, - { } }; int __init phonet_sysctl_init(void) diff --git a/net/psample/psample.c b/net/psample/psample.c index ddd211a151..a5d9b8446f 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -221,7 +221,7 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; - if (tun_key->tun_flags & TUNNEL_KEY && + if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags) && nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, PSAMPLE_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; @@ -257,10 +257,10 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, return -EMSGSIZE; if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_CSUM) && + if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (tun_key->tp_src && @@ -269,15 +269,16 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, if (tun_key->tp_dst && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_OAM) && + if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (tun_opts_len) { - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT && + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; - else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT && + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; @@ -314,7 +315,7 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) int tun_opts_len = tun_info->options_len; int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ - if (tun_key->tun_flags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags)) sum += nla_total_size_64bit(sizeof(u64)); if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) @@ -337,20 +338,21 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) if (tun_key->tos) sum += nla_total_size(sizeof(u8)); sum += nla_total_size(sizeof(u8)); /* TTL */ - if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags)) sum += nla_total_size(0); - if (tun_key->tun_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_key->tp_src) sum += nla_total_size(sizeof(u16)); if (tun_key->tp_dst) sum += nla_total_size(sizeof(u16)); - if (tun_key->tun_flags & TUNNEL_OAM) + if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_opts_len) { - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); - else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT) + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); } diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index 9ced13c062..69f53625a0 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -118,6 +118,51 @@ static const struct mhi_device_id qcom_mhi_qrtr_id_table[] = { }; MODULE_DEVICE_TABLE(mhi, qcom_mhi_qrtr_id_table); +static int __maybe_unused qcom_mhi_qrtr_pm_suspend_late(struct device *dev) +{ + struct mhi_device *mhi_dev = container_of(dev, struct mhi_device, dev); + enum mhi_state state; + + state = mhi_get_mhi_state(mhi_dev->mhi_cntrl); + /* + * If the device is in suspend state, then no need for the + * client driver to unprepare the channels. + */ + if (state == MHI_STATE_M3) + return 0; + + mhi_unprepare_from_transfer(mhi_dev); + + return 0; +} + +static int __maybe_unused qcom_mhi_qrtr_pm_resume_early(struct device *dev) +{ + struct mhi_device *mhi_dev = container_of(dev, struct mhi_device, dev); + enum mhi_state state; + int rc; + + state = mhi_get_mhi_state(mhi_dev->mhi_cntrl); + /* + * If the device is in suspend state, we won't unprepare channels + * in suspend callback, therefore no need to prepare channels when + * resume. + */ + if (state == MHI_STATE_M3) + return 0; + + rc = mhi_prepare_for_transfer_autoqueue(mhi_dev); + if (rc) + dev_err(dev, "failed to prepare for autoqueue transfer %d\n", rc); + + return rc; +} + +static const struct dev_pm_ops qcom_mhi_qrtr_pm_ops = { + SET_LATE_SYSTEM_SLEEP_PM_OPS(qcom_mhi_qrtr_pm_suspend_late, + qcom_mhi_qrtr_pm_resume_early) +}; + static struct mhi_driver qcom_mhi_qrtr_driver = { .probe = qcom_mhi_qrtr_probe, .remove = qcom_mhi_qrtr_remove, @@ -126,6 +171,7 @@ static struct mhi_driver qcom_mhi_qrtr_driver = { .id_table = qcom_mhi_qrtr_id_table, .driver = { .name = "qcom_mhi_qrtr", + .pm = &qcom_mhi_qrtr_pm_ops, }, }; diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index abb0c70ffc..654a3cc0d3 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -725,6 +725,24 @@ int qrtr_ns_init(void) if (ret < 0) goto err_wq; + /* As the qrtr ns socket owner and creator is the same module, we have + * to decrease the qrtr module reference count to guarantee that it + * remains zero after the ns socket is created, otherwise, executing + * "rmmod" command is unable to make the qrtr module deleted after the + * qrtr module is inserted successfully. + * + * However, the reference count is increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of qrtr socket's proto_ops struct; another is to increment the + * reference count of owner of qrtr proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. + */ + module_put(qrtr_ns.sock->ops->owner); + module_put(qrtr_ns.sock->sk->sk_prot_creator->owner); + return 0; err_wq: @@ -739,6 +757,15 @@ void qrtr_ns_remove(void) { cancel_work_sync(&qrtr_ns.work); destroy_workqueue(qrtr_ns.workqueue); + + /* sock_release() expects the two references that were put during + * qrtr_ns_init(). This function is only called during module remove, + * so try_stop_module() has already set the refcnt to 0. Use + * __module_get() instead of try_module_get() to successfully take two + * references. + */ + __module_get(qrtr_ns.sock->ops->owner); + __module_get(qrtr_ns.sock->sk->sk_prot_creator->owner); sock_release(qrtr_ns.sock); } EXPORT_SYMBOL_GPL(qrtr_ns_remove); diff --git a/net/rds/connection.c b/net/rds/connection.c index b4cc699c5f..c749c5525b 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -829,9 +829,7 @@ int rds_conn_init(void) if (ret) return ret; - rds_conn_slab = kmem_cache_create("rds_connection", - sizeof(struct rds_connection), - 0, 0, NULL); + rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index e4e41b3afc..2af678e71e 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -103,7 +103,6 @@ static struct ctl_table rds_ib_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; void rds_ib_sysctl_exit(void) diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index e381bbcd9c..025f518a43 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -89,7 +89,6 @@ static struct ctl_table rds_sysctl_rds_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; void rds_sysctl_exit(void) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 2dba7505b4..d8111ac83b 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -86,7 +86,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { .proc_handler = rds_tcp_skbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, - { } }; u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 05008ce5c4..d89bd8d0c3 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -105,6 +105,10 @@ int rds_tcp_accept_one(struct socket *sock) int conn_state; struct rds_conn_path *cp; struct in6_addr *my_addr, *peer_addr; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + .kern = true, + }; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; #endif @@ -119,7 +123,7 @@ int rds_tcp_accept_one(struct socket *sock) if (ret) goto out; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 4e32d65952..84529886c2 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -156,14 +156,12 @@ err_destroy: return ret; } -static int rfkill_gpio_remove(struct platform_device *pdev) +static void rfkill_gpio_remove(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev); rfkill_unregister(rfkill->rfkill_dev); rfkill_destroy(rfkill->rfkill_dev); - - return 0; } #ifdef CONFIG_ACPI @@ -183,7 +181,7 @@ MODULE_DEVICE_TABLE(of, rfkill_of_match); static struct platform_driver rfkill_gpio_driver = { .probe = rfkill_gpio_probe, - .remove = rfkill_gpio_remove, + .remove_new = rfkill_gpio_remove, .driver = { .name = "rfkill_gpio", .acpi_match_table = ACPI_PTR(rfkill_acpi_match), diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ef81d019b2..59050caab6 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -919,8 +919,8 @@ out_release: return err; } -static int rose_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int rose_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sk_buff *skb; struct sock *newsk; @@ -953,7 +953,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags, if (skb) break; - if (flags & O_NONBLOCK) { + if (arg->flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c index d391d7758f..d801315b70 100644 --- a/net/rose/sysctl_net_rose.c +++ b/net/rose/sysctl_net_rose.c @@ -112,7 +112,6 @@ static struct ctl_table rose_table[] = { .extra1 = &min_window, .extra2 = &max_window }, - { } }; void __init rose_register_sysctl(void) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 465bfe5eb0..f4844683e1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } @@ -487,7 +487,7 @@ EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); * rxrpc_kernel_set_max_life - Set maximum lifespan on a call * @sock: The socket the call is on * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in jiffies + * @hard_timeout: The maximum lifespan of the call in ms * * Set the maximum lifespan of a call. The call will end with ETIME or * ETIMEDOUT if it takes longer than this. @@ -495,14 +495,14 @@ EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, unsigned long hard_timeout) { - unsigned long now; + ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; mutex_lock(&call->user_mutex); - now = jiffies; - hard_timeout += now; - WRITE_ONCE(call->expect_term_by, hard_timeout); - rxrpc_reduce_call_timer(call, hard_timeout, now, rxrpc_timer_set_for_hard); + expect_term_by = ktime_add(ktime_get_real(), delay); + WRITE_ONCE(call->expect_term_by, expect_term_by); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); + rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); mutex_unlock(&call->user_mutex); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7818aae1be..08de24658f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -198,8 +198,8 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ union { + struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -208,9 +208,12 @@ struct rxrpc_skb_priv { }; struct { rxrpc_seq_t first_ack; /* First packet in acks table */ + rxrpc_seq_t prev_ack; /* Highest seq seen */ + rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ + u8 reason; /* Reason for ack */ u8 nr_acks; /* Number of acks+nacks */ u8 nr_nacks; /* Number of nacks */ - }; + } ack; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -248,10 +251,9 @@ struct rxrpc_security { struct rxrpc_key_token *); /* Work out how much data we can store in a packet, given an estimate - * of the amount of data remaining. + * of the amount of data remaining and allocate a data buffer. */ - int (*how_much_data)(struct rxrpc_call *, size_t, - size_t *, size_t *, size_t *); + struct rxrpc_txbuf *(*alloc_txbuf)(struct rxrpc_call *call, size_t remaining, gfp_t gfp); /* impose security on a packet */ int (*secure_packet)(struct rxrpc_call *, struct rxrpc_txbuf *); @@ -292,6 +294,7 @@ struct rxrpc_local { struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; struct completion io_thread_ready; /* Indication that the I/O thread started */ + struct page_frag_cache tx_alloc; /* Tx control packet allocation (I/O thread only) */ struct rxrpc_sock *service; /* Service(s) listening on this endpoint */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY struct sk_buff_head rx_delay_queue; /* Delay injection queue */ @@ -352,8 +355,8 @@ struct rxrpc_peer { u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ - u32 rto_j; /* Retransmission timeout in jiffies */ - u8 backoff; /* Backoff timeout */ + u32 rto_us; /* Retransmission timeout in usec */ + u8 backoff; /* Backoff timeout (as shift) */ u8 cong_ssthresh; /* Congestion slow-start threshold */ }; @@ -500,6 +503,8 @@ struct rxrpc_connection { struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ + struct page_frag_cache tx_data_alloc; /* Tx DATA packet allocation */ + struct mutex tx_data_alloc_lock; struct mutex security_lock; /* Lock for security management */ const struct rxrpc_security *security; /* applied security module */ @@ -618,17 +623,17 @@ struct rxrpc_call { const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ - unsigned long delay_ack_at; /* When DELAY ACK needs to happen */ - unsigned long ack_lost_at; /* When ACK is figured as lost */ - unsigned long resend_at; /* When next resend needs to happen */ - unsigned long ping_at; /* When next to send a ping */ - unsigned long keepalive_at; /* When next to send a keepalive ping */ - unsigned long expect_rx_by; /* When we expect to get a packet by */ - unsigned long expect_req_by; /* When we expect to get a request DATA packet by */ - unsigned long expect_term_by; /* When we expect call termination by */ - u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ - u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ - u32 hard_timo; /* Maximum lifetime or 0 (jif) */ + ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ + ktime_t ack_lost_at; /* When ACK is figured as lost */ + ktime_t resend_at; /* When next resend needs to happen */ + ktime_t ping_at; /* When next to send a ping */ + ktime_t keepalive_at; /* When next to send a keepalive ping */ + ktime_t expect_rx_by; /* When we expect to get a packet by */ + ktime_t expect_req_by; /* When we expect to get a request DATA packet by */ + ktime_t expect_term_by; /* When we expect call termination by */ + u32 next_rx_timo; /* Timeout for next Rx packet (ms) */ + u32 next_req_timo; /* Timeout for next Rx request packet (ms) */ + u32 hard_timo; /* Maximum lifetime or 0 (s) */ struct timer_list timer; /* Combined event timer */ struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -673,7 +678,7 @@ struct rxrpc_call { rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ - u16 tx_backoff; /* Delay to insert due to Tx failure */ + u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ u8 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 ktime_t tx_last_sent; /* Last time a transmission occurred */ @@ -692,7 +697,7 @@ struct rxrpc_call { * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN -#define RXRPC_MIN_CWND (RXRPC_TX_SMSS > 2190 ? 2 : RXRPC_TX_SMSS > 1095 ? 3 : 4) +#define RXRPC_MIN_CWND 4 u8 cong_cwnd; /* Congestion window size */ u8 cong_extra; /* Extra to send for congestion management */ u8 cong_ssthresh; /* Slow-start threshold */ @@ -788,40 +793,30 @@ struct rxrpc_send_params { * Buffer of data to be output as a packet. */ struct rxrpc_txbuf { - struct rcu_head rcu; struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ struct list_head tx_link; /* Link in live Enc queue or Tx queue */ ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ + rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; unsigned int len; /* Amount of data in buffer */ unsigned int space; /* Remaining data space */ unsigned int offset; /* Offset of fill point */ - unsigned long flags; -#define RXRPC_TXBUF_LAST 0 /* Set if last packet in Tx phase */ -#define RXRPC_TXBUF_RESENT 1 /* Set if has been resent */ + unsigned int flags; +#define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ +#define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ + __be16 cksum; /* Checksum to go in header */ + unsigned short ack_rwind; /* ACK receive window */ u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ - struct { - /* The packet for encrypting and DMA'ing. We align it such - * that data[] aligns correctly for any crypto blocksize. - */ - u8 pad[64 - sizeof(struct rxrpc_wire_header)]; - struct rxrpc_wire_header wire; /* Network-ready header */ - union { - u8 data[RXRPC_JUMBO_DATALEN]; /* Data packet */ - struct { - struct rxrpc_ackpacket ack; - DECLARE_FLEX_ARRAY(u8, acks); - }; - }; - } __aligned(64); + u8 nr_kvec; /* Amount of kvec[] used */ + struct kvec kvec[3]; }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) { - return txb->wire.flags & RXRPC_CLIENT_INITIATED; + return txb->flags & RXRPC_CLIENT_INITIATED; } static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) @@ -869,17 +864,11 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); */ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); -void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); -void rxrpc_reduce_call_timer(struct rxrpc_call *call, - unsigned long expire_at, - unsigned long now, - enum rxrpc_timer_trace why); - bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); /* @@ -1160,9 +1149,9 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) /* * output.c */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb); +void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); int rxrpc_send_abort_packet(struct rxrpc_call *); -int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); @@ -1223,7 +1212,7 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, */ void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); -unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool); +ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans); void rxrpc_peer_init_rtt(struct rxrpc_peer *); /* @@ -1295,8 +1284,9 @@ static inline void rxrpc_sysctl_exit(void) {} * txbuf.c */ extern atomic_t rxrpc_nr_txbuf; -struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, - gfp_t gfp); +struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, + size_t data_align, gfp_t gfp); +struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size); void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 0f78544d04..7bbb685047 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -23,14 +23,14 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why) { - unsigned long now = jiffies; - unsigned long ping_at = now + rxrpc_idle_ack_delay; - - if (time_before(ping_at, call->ping_at)) { - WRITE_ONCE(call->ping_at, ping_at); - rxrpc_reduce_call_timer(call, ping_at, now, - rxrpc_timer_set_for_ping); - trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial); + ktime_t delay = ms_to_ktime(READ_ONCE(rxrpc_idle_ack_delay)); + ktime_t now = ktime_get_real(); + ktime_t ping_at = ktime_add(now, delay); + + trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial); + if (ktime_before(ping_at, call->ping_at)) { + call->ping_at = ping_at; + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_ping); } } @@ -40,62 +40,18 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - unsigned long expiry = rxrpc_soft_ack_delay; - unsigned long now = jiffies, ack_at; - - if (rxrpc_soft_ack_delay < expiry) - expiry = rxrpc_soft_ack_delay; - if (call->peer->srtt_us != 0) - ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); - else - ack_at = expiry; - - ack_at += READ_ONCE(call->tx_backoff); - ack_at += now; - if (time_before(ack_at, call->delay_ack_at)) { - WRITE_ONCE(call->delay_ack_at, ack_at); - rxrpc_reduce_call_timer(call, ack_at, now, - rxrpc_timer_set_for_ack); - } + ktime_t now = ktime_get_real(), delay; trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); -} - -/* - * Queue an ACK for immediate transmission. - */ -void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, - rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) -{ - struct rxrpc_txbuf *txb; - - if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) - return; - - rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_ACK, - rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS); - if (!txb) { - kleave(" = -ENOMEM"); - return; - } + if (call->peer->srtt_us) + delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC; + else + delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay)); + ktime_add_ms(delay, call->tx_backoff); - txb->ack_why = why; - txb->wire.seq = 0; - txb->wire.type = RXRPC_PACKET_TYPE_ACK; - txb->wire.flags |= RXRPC_SLOW_START_OK; - txb->ack.bufferSpace = 0; - txb->ack.maxSkew = 0; - txb->ack.firstPacket = 0; - txb->ack.previousPacket = 0; - txb->ack.serial = htonl(serial); - txb->ack.reason = ack_reason; - txb->ack.nAcks = 0; - - trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); + call->delay_ack_at = ktime_add(now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_delayed_ack); } /* @@ -114,25 +70,19 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) struct rxrpc_ackpacket *ack = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_txbuf *txb; - unsigned long resend_at; - rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted); - ktime_t now, max_age, oldest, ack_ts; - bool unacked = false; + rxrpc_seq_t transmitted = call->tx_transmitted; + ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); + ktime_t resend_at = KTIME_MAX, now, delay; + bool unacked = false, did_send = false; unsigned int i; - LIST_HEAD(retrans_queue); _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); now = ktime_get_real(); - max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); - oldest = now; if (list_empty(&call->tx_buffer)) goto no_resend; - if (list_empty(&call->tx_buffer)) - goto no_further_resend; - trace_rxrpc_resend(call, ack_skb); txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); @@ -143,12 +93,12 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) sp = rxrpc_skb(ack_skb); ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); - for (i = 0; i < sp->nr_acks; i++) { + for (i = 0; i < sp->ack.nr_acks; i++) { rxrpc_seq_t seq; if (ack->acks[i] & 1) continue; - seq = sp->first_ack + i; + seq = sp->ack.first_ack + i; if (after(txb->seq, transmitted)) break; if (after(txb->seq, seq)) @@ -160,19 +110,23 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) goto no_further_resend; found_txb: - if (after(ntohl(txb->wire.serial), call->acks_highest_serial)) + resend_at = ktime_add(txb->last_sent, rto); + if (after(txb->serial, call->acks_highest_serial)) { + if (ktime_after(resend_at, now) && + ktime_before(resend_at, next_resend)) + next_resend = resend_at; continue; /* Ack point not yet reached */ + } rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); - if (list_empty(&txb->tx_link)) { - list_add_tail(&txb->tx_link, &retrans_queue); - set_bit(RXRPC_TXBUF_RESENT, &txb->flags); - } + trace_rxrpc_retransmit(call, txb->seq, txb->serial, + ktime_sub(resend_at, now)); - trace_rxrpc_retransmit(call, txb->seq, - ktime_to_ns(ktime_sub(txb->last_sent, - max_age))); + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_transmit_one(call, txb); + did_send = true; + now = ktime_get_real(); if (list_is_last(&txb->call_link, &call->tx_buffer)) goto no_further_resend; @@ -184,43 +138,46 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) * seen. Anything between the soft-ACK table and that point will get * ACK'd or NACK'd in due course, so don't worry about it here; here we * need to consider retransmitting anything beyond that point. - * - * Note that ACK for a packet can beat the update of tx_transmitted. */ - if (after_eq(READ_ONCE(call->acks_prev_seq), READ_ONCE(call->tx_transmitted))) + if (after_eq(call->acks_prev_seq, call->tx_transmitted)) goto no_further_resend; list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (before_eq(txb->seq, READ_ONCE(call->acks_prev_seq))) + resend_at = ktime_add(txb->last_sent, rto); + + if (before_eq(txb->seq, call->acks_prev_seq)) continue; - if (after(txb->seq, READ_ONCE(call->tx_transmitted))) + if (after(txb->seq, call->tx_transmitted)) break; /* Not transmitted yet */ if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && - before(ntohl(txb->wire.serial), ntohl(ack->serial))) + before(txb->serial, ntohl(ack->serial))) goto do_resend; /* Wasn't accounted for by a more recent ping. */ - if (ktime_after(txb->last_sent, max_age)) { - if (ktime_before(txb->last_sent, oldest)) - oldest = txb->last_sent; + if (ktime_after(resend_at, now)) { + if (ktime_before(resend_at, next_resend)) + next_resend = resend_at; continue; } do_resend: unacked = true; - if (list_empty(&txb->tx_link)) { - list_add_tail(&txb->tx_link, &retrans_queue); - set_bit(RXRPC_TXBUF_RESENT, &txb->flags); - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - } + + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_transmit_one(call, txb); + did_send = true; + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); + now = ktime_get_real(); } no_further_resend: no_resend: - resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); - resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, - !list_empty(&retrans_queue)); - WRITE_ONCE(call->resend_at, resend_at); + if (resend_at < KTIME_MAX) { + delay = rxrpc_get_rto_backoff(call->peer, did_send); + resend_at = ktime_add(resend_at, delay); + trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset); + } + call->resend_at = resend_at; if (unacked) rxrpc_congestion_timeout(call); @@ -229,25 +186,15 @@ no_resend: * that an ACK got lost somewhere. Send a ping to find out instead of * retransmitting data. */ - if (list_empty(&retrans_queue)) { - rxrpc_reduce_call_timer(call, resend_at, jiffies, - rxrpc_timer_set_for_resend); - ack_ts = ktime_sub(now, call->acks_latest_ts); - if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) - goto out; - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_lost_ack); - goto out; - } + if (!did_send) { + ktime_t next_ping = ktime_add_us(call->acks_latest_ts, + call->peer->srtt_us >> 3); - /* Retransmit the queue */ - while ((txb = list_first_entry_or_null(&retrans_queue, - struct rxrpc_txbuf, tx_link))) { - list_del_init(&txb->tx_link); - rxrpc_transmit_one(call, txb); + if (ktime_sub(next_ping, now) <= 0) + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_0_retrans); } -out: _leave(""); } @@ -257,13 +204,11 @@ out: */ static void rxrpc_begin_service_reply(struct rxrpc_call *call) { - unsigned long now = jiffies; - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SEND_REPLY); - WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); if (call->ackr_reason == RXRPC_ACK_DELAY) call->ackr_reason = 0; - trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); + call->delay_ack_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_delayed_ack); } /* @@ -320,7 +265,7 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) call->tx_top = txb->seq; list_add_tail(&txb->call_link, &call->tx_buffer); - if (txb->wire.flags & RXRPC_LAST_PACKET) + if (txb->flags & RXRPC_LAST_PACKET) rxrpc_close_tx_phase(call); rxrpc_transmit_one(call, txb); @@ -372,8 +317,8 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) */ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) { - unsigned long now, next, t; - bool resend = false, expired = false; + ktime_t now, t; + bool resend = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -397,70 +342,73 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) goto out; + if (skb) + rxrpc_input_call_packet(call, skb); + /* If we see our async-event poke, check for timeout trippage. */ - now = jiffies; - t = READ_ONCE(call->expect_rx_by); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now); - expired = true; + now = ktime_get_real(); + t = ktime_sub(call->expect_rx_by, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_expect_rx); + goto expired; } - t = READ_ONCE(call->expect_req_by); - if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST && - time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); - expired = true; + t = ktime_sub(call->expect_req_by, now); + if (t <= 0) { + call->expect_req_by = KTIME_MAX; + if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_idle); + goto expired; + } } - t = READ_ONCE(call->expect_term_by); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now); - expired = true; + t = ktime_sub(READ_ONCE(call->expect_term_by), now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_hard); + goto expired; } - t = READ_ONCE(call->delay_ack_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); - cmpxchg(&call->delay_ack_at, t, now + MAX_JIFFY_OFFSET); + t = ktime_sub(call->delay_ack_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_delayed_ack); + call->delay_ack_at = KTIME_MAX; rxrpc_send_ACK(call, RXRPC_ACK_DELAY, 0, - rxrpc_propose_ack_ping_for_lost_ack); + rxrpc_propose_ack_delayed_ack); } - t = READ_ONCE(call->ack_lost_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now); - cmpxchg(&call->ack_lost_at, t, now + MAX_JIFFY_OFFSET); + t = ktime_sub(call->ack_lost_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack); + call->ack_lost_at = KTIME_MAX; set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); } - t = READ_ONCE(call->keepalive_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now); - cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET); + t = ktime_sub(call->ping_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping); + call->ping_at = KTIME_MAX; rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_keepalive); } - t = READ_ONCE(call->ping_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); - cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET); - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_keepalive); - } - - t = READ_ONCE(call->resend_at); - if (time_after_eq(now, t)) { - trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now); - cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET); + t = ktime_sub(call->resend_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend); + call->resend_at = KTIME_MAX; resend = true; } - if (skb) - rxrpc_input_call_packet(call, skb); - rxrpc_transmit_some_data(call); + now = ktime_get_real(); + t = ktime_sub(call->keepalive_at, now); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_keepalive); + call->keepalive_at = KTIME_MAX; + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_keepalive); + } + if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -472,24 +420,13 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_send_initial_ping(call); /* Process events */ - if (expired) { - if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && - (int)call->conn->hi_serial - (int)call->rx_serial > 0) { - trace_rxrpc_call_reset(call); - rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET, - rxrpc_abort_call_reset); - } else { - rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME, - rxrpc_abort_call_timeout); - } - goto out; - } - if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY) + if (resend && + __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && + !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) rxrpc_resend(call, NULL); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) @@ -511,23 +448,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) /* Make sure the timer is restarted */ if (!__rxrpc_call_is_complete(call)) { - next = call->expect_rx_by; + ktime_t next = READ_ONCE(call->expect_term_by), delay; -#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } +#define set(T) { ktime_t _t = (T); if (ktime_before(_t, next)) next = _t; } set(call->expect_req_by); - set(call->expect_term_by); + set(call->expect_rx_by); set(call->delay_ack_at); set(call->ack_lost_at); set(call->resend_at); set(call->keepalive_at); set(call->ping_at); - now = jiffies; - if (time_after_eq(now, next)) + now = ktime_get_real(); + delay = ktime_sub(next, now); + if (delay <= 0) { rxrpc_poke_call(call, rxrpc_call_poke_timer_now); - - rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); + } else { + unsigned long nowj = jiffies, delayj, nextj; + + delayj = max(nsecs_to_jiffies(delay), 1); + nextj = nowj + delayj; + if (time_before(nextj, call->timer.expires) || + !timer_pending(&call->timer)) { + trace_rxrpc_timer_restart(call, delay, delayj); + timer_reduce(&call->timer, nextj); + } + } } out: @@ -542,4 +489,16 @@ out: rxrpc_shrink_call_tx_buffer(call); _leave(""); return true; + +expired: + if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && + (int)call->conn->hi_serial - (int)call->rx_serial > 0) { + trace_rxrpc_call_reset(call); + rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET, + rxrpc_abort_call_reset); + } else { + rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME, + rxrpc_abort_call_timeout); + } + goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9fc9a6c3f6..f9e983a12c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -70,20 +70,11 @@ static void rxrpc_call_timer_expired(struct timer_list *t) _enter("%d", call->debug_id); if (!__rxrpc_call_is_complete(call)) { - trace_rxrpc_timer_expired(call, jiffies); + trace_rxrpc_timer_expired(call); rxrpc_poke_call(call, rxrpc_call_poke_timer); } } -void rxrpc_reduce_call_timer(struct rxrpc_call *call, - unsigned long expire_at, - unsigned long now, - enum rxrpc_timer_trace why) -{ - trace_rxrpc_timer(call, why, now); - timer_reduce(&call->timer, expire_at); -} - static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; static void rxrpc_destroy_call(struct work_struct *); @@ -163,24 +154,27 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); - call->debug_id = debug_id; - call->tx_total_len = -1; - call->next_rx_timo = 20 * HZ; - call->next_req_timo = 1 * HZ; - call->ackr_window = 1; - call->ackr_wtop = 1; + call->debug_id = debug_id; + call->tx_total_len = -1; + call->next_rx_timo = 20 * HZ; + call->next_req_timo = 1 * HZ; + call->ackr_window = 1; + call->ackr_wtop = 1; + call->delay_ack_at = KTIME_MAX; + call->ack_lost_at = KTIME_MAX; + call->resend_at = KTIME_MAX; + call->ping_at = KTIME_MAX; + call->keepalive_at = KTIME_MAX; + call->expect_rx_by = KTIME_MAX; + call->expect_req_by = KTIME_MAX; + call->expect_term_by = KTIME_MAX; memset(&call->sock_node, 0xed, sizeof(call->sock_node)); call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; - if (RXRPC_TX_SMSS > 2190) - call->cong_cwnd = 2; - else if (RXRPC_TX_SMSS > 1095) - call->cong_cwnd = 3; - else - call->cong_cwnd = 4; + call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; call->rxnet = rxnet; @@ -226,11 +220,11 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(msecs_to_jiffies(p->timeouts.normal), 1UL); + call->next_rx_timo = min(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(msecs_to_jiffies(p->timeouts.idle), 1UL); + call->next_req_timo = min(p->timeouts.idle, 1); if (p->timeouts.hard) - call->hard_timo = p->timeouts.hard * HZ; + call->hard_timo = p->timeouts.hard; ret = rxrpc_init_client_call_security(call); if (ret < 0) { @@ -253,18 +247,13 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, */ void rxrpc_start_call_timer(struct rxrpc_call *call) { - unsigned long now = jiffies; - unsigned long j = now + MAX_JIFFY_OFFSET; - - call->delay_ack_at = j; - call->ack_lost_at = j; - call->resend_at = j; - call->ping_at = j; - call->keepalive_at = j; - call->expect_rx_by = j; - call->expect_req_by = j; - call->expect_term_by = j + call->hard_timo; - call->timer.expires = now; + if (call->hard_timo) { + ktime_t delay = ms_to_ktime(call->hard_timo * 1000); + + call->expect_term_by = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); + } + call->timer.expires = jiffies; } /* diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3b9b267a44..d25bf1cf36 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -636,7 +636,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { unsigned long final_ack_at = jiffies + 2; - WRITE_ONCE(chan->final_ack_at, final_ack_at); + chan->final_ack_at = final_ack_at; smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */ set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); rxrpc_reduce_conn_timer(conn, final_ack_at); @@ -770,7 +770,7 @@ next: conn_expires_at = conn->idle_timestamp + expiry; - now = READ_ONCE(jiffies); + now = jiffies; if (time_after(conn_expires_at, now)) goto not_yet_expired; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 1f251d758c..598b4ee389 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -88,7 +88,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_ackpacket ack; }; } __attribute__((packed)) pkt; - struct rxrpc_ackinfo ack_info; + struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; u32 serial, mtu, call_id, padding; @@ -122,8 +122,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, iov[0].iov_len = sizeof(pkt.whdr); iov[1].iov_base = &padding; iov[1].iov_len = 3; - iov[2].iov_base = &ack_info; - iov[2].iov_len = sizeof(ack_info); + iov[2].iov_base = &trailer; + iov[2].iov_len = sizeof(trailer); serial = rxrpc_get_next_serial(conn); @@ -158,14 +158,14 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - ack_info.rxMTU = htonl(rxrpc_rx_mtu); - ack_info.maxMTU = htonl(mtu); - ack_info.rwind = htonl(rxrpc_rx_window_size); - ack_info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.maxMTU = htonl(rxrpc_rx_mtu); + trailer.ifMTU = htonl(mtu); + trailer.rwind = htonl(rxrpc_rx_window_size); + trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); - len += sizeof(pkt.ack) + 3 + sizeof(ack_info); + len += sizeof(pkt.ack) + 3 + sizeof(trailer); ioc = 3; trace_rxrpc_tx_ack(chan->call_debug_id, serial, diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index df8a271948..1539d315af 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -68,6 +68,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); mutex_init(&conn->security_lock); + mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; @@ -118,18 +119,13 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo switch (srx->transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != - srx->transport.sin.sin_port || - peer->srx.transport.sin.sin_addr.s_addr != - srx->transport.sin.sin_addr.s_addr) + srx->transport.sin.sin_port) goto not_found; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != - srx->transport.sin6.sin6_port || - memcmp(&peer->srx.transport.sin6.sin6_addr, - &srx->transport.sin6.sin6_addr, - sizeof(struct in6_addr)) != 0) + srx->transport.sin6.sin6_port) goto not_found; break; #endif @@ -341,6 +337,9 @@ static void rxrpc_clean_up_connection(struct work_struct *work) */ rxrpc_purge_queue(&conn->rx_queue); + if (conn->tx_data_alloc.va) + __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va), + conn->tx_data_alloc.pagecnt_bias); call_rcu(&conn->rcu, rxrpc_rcu_free_connection); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9691de00ad..16d49a861d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -9,6 +9,17 @@ #include "ar-internal.h" +/* Override priority when generating ACKs for received DATA */ +static const u8 rxrpc_ack_priority[RXRPC_ACK__INVALID] = { + [RXRPC_ACK_IDLE] = 1, + [RXRPC_ACK_DELAY] = 2, + [RXRPC_ACK_REQUESTED] = 3, + [RXRPC_ACK_DUPLICATE] = 4, + [RXRPC_ACK_EXCEEDS_WINDOW] = 5, + [RXRPC_ACK_NOSPACE] = 6, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 7, +}; + static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, enum rxrpc_abort_reason why) { @@ -212,7 +223,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { if (before_eq(txb->seq, call->acks_hard_ack)) continue; - if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) { + if (txb->flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } @@ -252,6 +263,9 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); + call->resend_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); + if (unlikely(call->cong_last_nack)) { rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); call->cong_last_nack = NULL; @@ -288,15 +302,11 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, static bool rxrpc_receiving_reply(struct rxrpc_call *call) { struct rxrpc_ack_summary summary = { 0 }; - unsigned long now, timo; rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { - now = jiffies; - timo = now + MAX_JIFFY_OFFSET; - - WRITE_ONCE(call->delay_ack_at, timo); - trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); + call->delay_ack_at = KTIME_MAX; + trace_rxrpc_timer_can(call, rxrpc_timer_trace_delayed_ack); } if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { @@ -329,7 +339,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) case RXRPC_CALL_SERVER_RECV_REQUEST: rxrpc_set_call_state(call, RXRPC_CALL_SERVER_ACK_REQUEST); - call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; + call->expect_req_by = KTIME_MAX; rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_processing_op); break; @@ -366,7 +376,7 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, * Process a DATA packet. */ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, - bool *_notify) + bool *_notify, rxrpc_serial_t *_ack_serial, int *_ack_reason) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sk_buff *oos; @@ -419,8 +429,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, /* Send an immediate ACK if we fill in a hole */ else if (!skb_queue_empty(&call->rx_oos_queue)) ack_reason = RXRPC_ACK_DELAY; - else - call->ackr_nr_unacked++; window++; if (after(window, wtop)) { @@ -498,12 +506,16 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, } send_ack: - if (ack_reason >= 0) - rxrpc_send_ACK(call, ack_reason, serial, - rxrpc_propose_ack_input_data); - else - rxrpc_propose_delay_ACK(call, serial, - rxrpc_propose_ack_input_data); + if (ack_reason >= 0) { + if (rxrpc_ack_priority[ack_reason] > rxrpc_ack_priority[*_ack_reason]) { + *_ack_serial = serial; + *_ack_reason = ack_reason; + } else if (rxrpc_ack_priority[ack_reason] == rxrpc_ack_priority[*_ack_reason] && + ack_reason == RXRPC_ACK_REQUESTED) { + *_ack_serial = serial; + *_ack_reason = ack_reason; + } + } } /* @@ -514,9 +526,11 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_jumbo_header jhdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb), *jsp; struct sk_buff *jskb; + rxrpc_serial_t ack_serial = 0; unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; bool notify = false; + int ack_reason = 0; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -536,7 +550,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb jsp = rxrpc_skb(jskb); jsp->offset = offset; jsp->len = RXRPC_JUMBO_DATALEN; - rxrpc_input_data_one(call, jskb, ¬ify); + rxrpc_input_data_one(call, jskb, ¬ify, &ack_serial, &ack_reason); rxrpc_free_skb(jskb, rxrpc_skb_put_jumbo_subpacket); sp->hdr.flags = jhdr.flags; @@ -549,7 +563,16 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->offset = offset; sp->len = len; - rxrpc_input_data_one(call, skb, ¬ify); + rxrpc_input_data_one(call, skb, ¬ify, &ack_serial, &ack_reason); + + if (ack_reason > 0) { + rxrpc_send_ACK(call, ack_reason, ack_serial, + rxrpc_propose_ack_input_data); + } else { + call->ackr_nr_unacked++; + rxrpc_propose_delay_ACK(call, sp->hdr.serial, + rxrpc_propose_ack_input_data); + } if (notify) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); @@ -589,14 +612,12 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_SERVER_RECV_REQUEST: { unsigned long timo = READ_ONCE(call->next_req_timo); - unsigned long now, expect_req_by; if (timo) { - now = jiffies; - expect_req_by = now + timo; - WRITE_ONCE(call->expect_req_by, expect_req_by); - rxrpc_reduce_call_timer(call, expect_req_by, now, - rxrpc_timer_set_for_idle); + ktime_t delay = ms_to_ktime(timo); + + call->expect_req_by = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_idle); } break; } @@ -670,14 +691,14 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, /* * Process the extra information that may be appended to an ACK packet */ -static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - struct rxrpc_ackinfo *ackinfo) +static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb, + struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer; unsigned int mtu; bool wake = false; - u32 rwind = ntohl(ackinfo->rwind); + u32 rwind = ntohl(trailer->rwind); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -688,10 +709,7 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, call->tx_winsize = rwind; } - if (call->cong_ssthresh > rwind) - call->cong_ssthresh = rwind; - - mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); + mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); peer = call->peer; if (mtu < peer->maxdata) { @@ -713,20 +731,19 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, rxrpc_seq_t seq) { struct sk_buff *skb = call->cong_last_nack; - struct rxrpc_ackpacket ack; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t old_seq = sp->first_ack; - u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(ack); + rxrpc_seq_t old_seq = sp->ack.first_ack; + u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - if (after_eq(seq, old_seq + sp->nr_acks)) { - summary->nr_new_acks += sp->nr_nacks; - summary->nr_new_acks += seq - (old_seq + sp->nr_acks); + if (after_eq(seq, old_seq + sp->ack.nr_acks)) { + summary->nr_new_acks += sp->ack.nr_nacks; + summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks); summary->nr_retained_nacks = 0; } else if (seq == old_seq) { - summary->nr_retained_nacks = sp->nr_nacks; + summary->nr_retained_nacks = sp->ack.nr_nacks; } else { - for (i = 0; i < sp->nr_acks; i++) { + for (i = 0; i < sp->ack.nr_acks; i++) { if (acks[i] == RXRPC_ACK_TYPE_NACK) { if (before(old_seq + i, seq)) new_acks++; @@ -739,7 +756,7 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, summary->nr_retained_nacks = retained_nacks; } - return old_seq + sp->nr_acks; + return old_seq + sp->ack.nr_acks; } /* @@ -759,10 +776,10 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned int i, old_nacks = 0; - rxrpc_seq_t lowest_nak = seq + sp->nr_acks; + rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < sp->nr_acks; i++) { + for (i = 0; i < sp->ack.nr_acks; i++) { if (acks[i] == RXRPC_ACK_TYPE_ACK) { summary->nr_acks++; if (after_eq(seq, since)) @@ -774,7 +791,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, old_nacks++; } else { summary->nr_new_nacks++; - sp->nr_nacks++; + sp->ack.nr_nacks++; } if (before(seq, lowest_nak)) @@ -835,38 +852,32 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_ackpacket ack; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_ackinfo info; + struct rxrpc_acktrailer trailer; rxrpc_serial_t ack_serial, acked_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; int nr_acks, offset, ioffset; _enter(""); - offset = sizeof(struct rxrpc_wire_header); - if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) - return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack); - offset += sizeof(ack); - - ack_serial = sp->hdr.serial; - acked_serial = ntohl(ack.serial); - first_soft_ack = ntohl(ack.firstPacket); - prev_pkt = ntohl(ack.previousPacket); - hard_ack = first_soft_ack - 1; - nr_acks = ack.nAcks; - sp->first_ack = first_soft_ack; - sp->nr_acks = nr_acks; - summary.ack_reason = (ack.reason < RXRPC_ACK__INVALID ? - ack.reason : RXRPC_ACK__INVALID); + offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); + + ack_serial = sp->hdr.serial; + acked_serial = sp->ack.acked_serial; + first_soft_ack = sp->ack.first_ack; + prev_pkt = sp->ack.prev_ack; + nr_acks = sp->ack.nr_acks; + hard_ack = first_soft_ack - 1; + summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? + sp->ack.reason : RXRPC_ACK__INVALID); trace_rxrpc_rx_ack(call, ack_serial, acked_serial, first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); - rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); if (acked_serial != 0) { - switch (ack.reason) { + switch (summary.ack_reason) { case RXRPC_ACK_PING_RESPONSE: rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, rxrpc_rtt_rx_ping_response); @@ -886,7 +897,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. */ - if (unlikely(ack.reason == RXRPC_ACK_EXCEEDS_WINDOW) && + if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) && first_soft_ack == 1 && prev_pkt == 0 && rxrpc_is_client_call(call)) { @@ -899,7 +910,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * indicate a change of address. However, we can retransmit the call * if we still have it buffered to the beginning. */ - if (unlikely(ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) && + if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && first_soft_ack == 1 && prev_pkt == 0 && call->acks_hard_ack == 0 && @@ -917,11 +928,11 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto send_response; } - info.rxMTU = 0; + trailer.maxMTU = 0; ioffset = offset + nr_acks + 3; - if (skb->len >= ioffset + sizeof(info) && - skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) - return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_info); + if (skb->len >= ioffset + sizeof(trailer) && + skb_copy_bits(skb, ioffset, &trailer, sizeof(trailer)) < 0) + return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_trailer); if (nr_acks > 0) skb_condense(skb); @@ -940,7 +951,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) call->acks_first_seq = first_soft_ack; call->acks_prev_seq = prev_pkt; - switch (ack.reason) { + switch (summary.ack_reason) { case RXRPC_ACK_PING: break; default: @@ -950,8 +961,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Parse rwind and mtu sizes if provided. */ - if (info.rxMTU) - rxrpc_input_ackinfo(call, skb, &info); + if (trailer.maxMTU) + rxrpc_input_ack_trailer(call, skb, &trailer); if (first_soft_ack == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); @@ -997,7 +1008,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_congestion_management(call, skb, &summary, acked_serial); send_response: - if (ack.reason == RXRPC_ACK_PING) + if (summary.ack_reason == RXRPC_ACK_PING) rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, rxrpc_propose_ack_respond_to_ping); else if (sp->hdr.flags & RXRPC_REQUEST_ACK) @@ -1048,12 +1059,10 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) timo = READ_ONCE(call->next_rx_timo); if (timo) { - unsigned long now = jiffies, expect_rx_by; + ktime_t delay = ms_to_ktime(timo); - expect_rx_by = now + timo; - WRITE_ONCE(call->expect_rx_by, expect_rx_by); - rxrpc_reduce_call_timer(call, expect_rx_by, now, - rxrpc_timer_set_for_normal); + call->expect_rx_by = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } switch (sp->hdr.type) { diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 34353b6e58..6716c021a5 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -15,14 +15,11 @@ static int none_init_connection_security(struct rxrpc_connection *conn, } /* - * Work out how much data we can put in an unsecured packet. + * Allocate an appropriately sized buffer for the amount of data remaining. */ -static int none_how_much_data(struct rxrpc_call *call, size_t remain, - size_t *_buf_size, size_t *_data_size, size_t *_offset) +static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - *_buf_size = *_data_size = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - *_offset = 0; - return 0; + return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) @@ -79,7 +76,7 @@ const struct rxrpc_security rxrpc_no_security = { .exit = none_exit, .init_connection_security = none_init_connection_security, .free_call_crypto = none_free_call_crypto, - .how_much_data = none_how_much_data, + .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .respond_to_challenge = none_respond_to_challenge, diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 4a3a08a0e2..0300baa9af 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -124,6 +124,7 @@ static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) { struct rxrpc_wire_header whdr; + struct rxrpc_ackpacket ack; /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) @@ -141,6 +142,16 @@ static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) { + if (skb_copy_bits(skb, sizeof(whdr), &ack, sizeof(ack)) < 0) + return rxrpc_bad_message(skb, rxrpc_badmsg_short_ack); + sp->ack.first_ack = ntohl(ack.firstPacket); + sp->ack.prev_ack = ntohl(ack.previousPacket); + sp->ack.acked_serial = ntohl(ack.serial); + sp->ack.reason = ack.reason; + sp->ack.nr_acks = ack.nAcks; + } return true; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 34d3073681..504453c688 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -452,6 +452,9 @@ void rxrpc_destroy_local(struct rxrpc_local *local) #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); + if (local->tx_alloc.va) + __page_frag_cache_drain(virt_to_page(local->tx_alloc.va), + local->tx_alloc.pagecnt_bias); } /* diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 825b811830..657cf35089 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -17,22 +17,22 @@ unsigned int rxrpc_max_backlog __read_mostly = 10; /* - * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * How long to wait before scheduling an ACK with subtype DELAY (in ms). * * We use this when we've received new data packets. If those packets aren't * all consumed within this time we will send a DELAY ACK if an ACK was not * requested to let the sender know it doesn't need to resend. */ -unsigned long rxrpc_soft_ack_delay = HZ; +unsigned long rxrpc_soft_ack_delay = 1000; /* - * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * How long to wait before scheduling an ACK with subtype IDLE (in ms). * * We use this when we've consumed some previously soft-ACK'd packets when * further packets aren't immediately received to decide when to send an IDLE * ACK let the other end know that it can free up its Tx buffer space. */ -unsigned long rxrpc_idle_ack_delay = HZ / 2; +unsigned long rxrpc_idle_ack_delay = 500; /* * Receive window size in packets. This indicates the maximum number of diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 4a292f860a..5ea9601efd 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -48,12 +48,10 @@ static const char rxrpc_keepalive_string[] = ""; static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) { if (ret < 0) { - u16 tx_backoff = READ_ONCE(call->tx_backoff); - - if (tx_backoff < HZ) - WRITE_ONCE(call->tx_backoff, tx_backoff + 1); + if (call->tx_backoff < 1000) + call->tx_backoff += 100; } else { - WRITE_ONCE(call->tx_backoff, 0); + call->tx_backoff = 0; } } @@ -65,84 +63,92 @@ static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) * Receiving a response to the ping will prevent the ->expect_rx_by timer from * expiring. */ -static void rxrpc_set_keepalive(struct rxrpc_call *call) +static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now) { - unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6; + ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo) / 6); - keepalive_at += now; - WRITE_ONCE(call->keepalive_at, keepalive_at); - rxrpc_reduce_call_timer(call, keepalive_at, now, - rxrpc_timer_set_for_keepalive); + call->keepalive_at = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_keepalive); } /* * Fill out an ACK packet. */ -static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, - struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - u16 *_rwind) +static void rxrpc_fill_out_ack(struct rxrpc_call *call, + struct rxrpc_txbuf *txb, + u8 ack_reason, + rxrpc_serial_t serial) { - struct rxrpc_ackinfo ackinfo; + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; + struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); unsigned int qsize, sack, wrap, to; rxrpc_seq_t window, wtop; int rsize; u32 mtu, jmax; - u8 *ackp = txb->acks; + u8 *filler = txb->kvec[2].iov_base; + u8 *sackp = txb->kvec[1].iov_base; - call->ackr_nr_unacked = 0; - atomic_set(&call->ackr_nr_consumed, 0); rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); - clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); window = call->ackr_window; wtop = call->ackr_wtop; sack = call->ackr_sack_base % RXRPC_SACK_SIZE; - txb->ack.firstPacket = htonl(window); - txb->ack.nAcks = wtop - window; + + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_ACK; + txb->flags |= RXRPC_SLOW_START_OK; + ack->bufferSpace = 0; + ack->maxSkew = 0; + ack->firstPacket = htonl(window); + ack->previousPacket = htonl(call->rx_highest_seq); + ack->serial = htonl(serial); + ack->reason = ack_reason; + ack->nAcks = wtop - window; + filler[0] = 0; + filler[1] = 0; + filler[2] = 0; + + if (ack_reason == RXRPC_ACK_PING) + txb->flags |= RXRPC_REQUEST_ACK; if (after(wtop, window)) { + txb->len += ack->nAcks; + txb->kvec[1].iov_base = sackp; + txb->kvec[1].iov_len = ack->nAcks; + wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, txb->ack.nAcks, RXRPC_SACK_SIZE); + to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE); - if (sack + txb->ack.nAcks <= RXRPC_SACK_SIZE) { - memcpy(txb->acks, call->ackr_sack_table + sack, txb->ack.nAcks); + if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { + memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); } else { - memcpy(txb->acks, call->ackr_sack_table + sack, wrap); - memcpy(txb->acks + wrap, call->ackr_sack_table, - to - wrap); + memcpy(sackp, call->ackr_sack_table + sack, wrap); + memcpy(sackp + wrap, call->ackr_sack_table, to - wrap); } - - ackp += to; } else if (before(wtop, window)) { pr_warn("ack window backward %x %x", window, wtop); - } else if (txb->ack.reason == RXRPC_ACK_DELAY) { - txb->ack.reason = RXRPC_ACK_IDLE; + } else if (ack->reason == RXRPC_ACK_DELAY) { + ack->reason = RXRPC_ACK_IDLE; } - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + mtu = call->peer->if_mtu; + mtu -= call->peer->hdrsize; jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); - *_rwind = rsize; - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rsize); - ackinfo.jumbo_max = htonl(jmax); - - *ackp++ = 0; - *ackp++ = 0; - *ackp++ = 0; - memcpy(ackp, &ackinfo, sizeof(ackinfo)); - return txb->ack.nAcks + 3 + sizeof(ackinfo); + txb->ack_rwind = rsize; + trailer->maxMTU = htonl(rxrpc_rx_mtu); + trailer->ifMTU = htonl(mtu); + trailer->rwind = htonl(rsize); + trailer->jumbo_max = htonl(jmax); } /* * Record the beginning of an RTT probe. */ -static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, - enum rxrpc_rtt_tx_trace why) +static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + ktime_t now, enum rxrpc_rtt_tx_trace why) { unsigned long avail = call->rtt_avail; int rtt_slot = 9; @@ -155,47 +161,31 @@ static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, goto no_slot; call->rtt_serial[rtt_slot] = serial; - call->rtt_sent_at[rtt_slot] = ktime_get_real(); + call->rtt_sent_at[rtt_slot] = now; smp_wmb(); /* Write data before avail bit */ set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); - return rtt_slot; + return; no_slot: trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); - return -1; -} - -/* - * Cancel an RTT probe. - */ -static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, - rxrpc_serial_t serial, int rtt_slot) -{ - if (rtt_slot != -1) { - clear_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); - smp_wmb(); /* Clear pending bit before setting slot */ - set_bit(rtt_slot, &call->rtt_avail); - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_cancel, rtt_slot, serial); - } } /* * Transmit an ACK packet. */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; struct rxrpc_connection *conn; + struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; - struct kvec iov[1]; - rxrpc_serial_t serial; - size_t len, n; - int ret, rtt_slot = -1; - u16 rwind; + ktime_t now; + int ret; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) - return -ECONNRESET; + return; conn = call->conn; @@ -203,55 +193,68 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = 0; - - if (txb->ack.reason == RXRPC_ACK_PING) - txb->wire.flags |= RXRPC_REQUEST_ACK; - - n = rxrpc_fill_out_ack(conn, call, txb, &rwind); - if (n == 0) - return 0; - - iov[0].iov_base = &txb->wire; - iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n; - len = iov[0].iov_len; + msg.msg_flags = MSG_SPLICE_PAGES; - serial = rxrpc_get_next_serial(conn); - txb->wire.serial = htonl(serial); - trace_rxrpc_tx_ack(call->debug_id, serial, - ntohl(txb->ack.firstPacket), - ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks, - rwind); + whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - if (txb->ack.reason == RXRPC_ACK_PING) - rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); + txb->serial = rxrpc_get_next_serial(conn); + whdr->serial = htonl(txb->serial); + trace_rxrpc_tx_ack(call->debug_id, txb->serial, + ntohl(ack->firstPacket), + ntohl(ack->serial), ack->reason, ack->nAcks, + txb->ack_rwind); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - /* Grab the highest received seq as late as possible */ - txb->ack.previousPacket = htonl(call->rx_highest_seq); - - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); - ret = do_udp_sendmsg(conn->local->socket, &msg, len); + iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len); + rxrpc_local_dont_fragment(conn->local, false); + ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { - trace_rxrpc_tx_fail(call->debug_id, serial, ret, + trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, rxrpc_tx_point_call_ack); } else { - trace_rxrpc_tx_packet(call->debug_id, &txb->wire, + trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); - if (txb->wire.flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = ktime_get_real(); + now = ktime_get_real(); + if (ack->reason == RXRPC_ACK_PING) + rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping); + if (txb->flags & RXRPC_REQUEST_ACK) + call->peer->rtt_last_req = now; + rxrpc_set_keepalive(call, now); } rxrpc_tx_backoff(call, ret); +} - if (!__rxrpc_call_is_complete(call)) { - if (ret < 0) - rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - rxrpc_set_keepalive(call); +/* + * Queue an ACK for immediate transmission. + */ +void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) +{ + struct rxrpc_txbuf *txb; + + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) + return; + + rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); + + txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window); + if (!txb) { + kleave(" = -ENOMEM"); + return; } - return ret; + txb->ack_why = why; + + rxrpc_fill_out_ack(call, txb, ack_reason, serial); + call->ackr_nr_unacked = 0; + atomic_set(&call->ackr_nr_consumed, 0); + clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); + + trace_rxrpc_send_ack(call, why, ack_reason, serial); + rxrpc_send_ack_packet(call, txb); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } /* @@ -319,38 +322,22 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) } /* - * send a packet through the transport endpoint + * Prepare a (sub)packet for transmission. */ -int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, + rxrpc_serial_t serial) { + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - struct msghdr msg; - struct kvec iov[1]; - rxrpc_serial_t serial; - size_t len; - int ret, rtt_slot = -1; _enter("%x,{%d}", txb->seq, txb->len); - /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(conn); - txb->wire.serial = htonl(serial); + txb->serial = serial; if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && txb->seq == 1) - txb->wire.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; - - iov[0].iov_base = &txb->wire; - iov[0].iov_len = sizeof(txb->wire) + txb->len; - len = iov[0].iov_len; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); - - msg.msg_name = &call->peer->srx.transport; - msg.msg_namelen = call->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; + whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. @@ -359,13 +346,13 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (txb->wire.flags & RXRPC_REQUEST_ACK) + if (txb->flags & RXRPC_REQUEST_ACK) why = rxrpc_reqack_already_on; - else if (test_bit(RXRPC_TXBUF_LAST, &txb->flags) && rxrpc_sending_to_client(txb)) + else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; - else if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags)) + else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) why = rxrpc_reqack_slow_start; @@ -381,42 +368,116 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); if (why != rxrpc_reqack_no_srv_last) - txb->wire.flags |= RXRPC_REQUEST_ACK; + txb->flags |= RXRPC_REQUEST_ACK; dont_set_request_ack: + whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + whdr->serial = htonl(txb->serial); + whdr->cksum = txb->cksum; + + trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false); +} + +/* + * Prepare a packet for transmission. + */ +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + rxrpc_serial_t serial; + + /* Each transmission of a Tx packet needs a new serial number */ + serial = rxrpc_get_next_serial(call->conn); + + rxrpc_prepare_data_subpacket(call, txb, serial); + + return txb->len; +} + +/* + * Set timeouts after transmitting a packet. + */ +static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + ktime_t now = ktime_get_real(); + bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; + + call->tx_last_sent = now; + txb->last_sent = now; + + if (ack_requested) { + rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data); + + call->peer->rtt_last_req = now; + if (call->peer->rtt_count > 1) { + ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + + call->ack_lost_at = ktime_add(now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); + } + } + + if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { + ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo)); + + call->expect_rx_by = ktime_add(now, delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); + } + + rxrpc_set_keepalive(call, now); +} + +/* + * send a packet through the transport endpoint + */ +static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_connection *conn = call->conn; + enum rxrpc_tx_point frag; + struct msghdr msg; + size_t len; + int ret; + + _enter("%x,{%d}", txb->seq, txb->len); + + len = rxrpc_prepare_data_packet(call, txb); + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { ret = 0; - trace_rxrpc_tx_data(call, txb->seq, serial, - txb->wire.flags, - test_bit(RXRPC_TXBUF_RESENT, &txb->flags), - true); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, + txb->flags, true); goto done; } } - trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags, - test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false); + iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len); + + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; /* Track what we've attempted to transmit at least once so that the * retransmission algorithm doesn't try to resend what we haven't sent - * yet. However, this can race as we can receive an ACK before we get - * to this point. But, OTOH, if we won't get an ACK mentioning this - * packet unless the far side received it (though it could have - * discarded it anyway and NAK'd it). + * yet. */ - cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq); + if (txb->seq == call->tx_transmitted + 1) + call->tx_transmitted = txb->seq; /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (txb->len >= call->peer->maxdata) - goto send_fragmentable; - - txb->last_sent = ktime_get_real(); - if (txb->wire.flags & RXRPC_REQUEST_ACK) - rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); + if (txb->len >= call->peer->maxdata) { + rxrpc_local_dont_fragment(conn->local, false); + frag = rxrpc_tx_point_call_data_frag; + } else { + rxrpc_local_dont_fragment(conn->local, true); + frag = rxrpc_tx_point_call_data_nofrag; + } +retry: /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -429,46 +490,21 @@ dont_set_request_ack: if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); - rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_point_call_data_nofrag); + trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, &txb->wire, - rxrpc_tx_point_call_data_nofrag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE) - goto send_fragmentable; + if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) { + rxrpc_local_dont_fragment(conn->local, false); + frag = rxrpc_tx_point_call_data_frag; + goto retry; + } done: if (ret >= 0) { - call->tx_last_sent = txb->last_sent; - if (txb->wire.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = txb->last_sent; - if (call->peer->rtt_count > 1) { - unsigned long nowj = jiffies, ack_lost_at; - - ack_lost_at = rxrpc_get_rto_backoff(call->peer, false); - ack_lost_at += nowj; - WRITE_ONCE(call->ack_lost_at, ack_lost_at); - rxrpc_reduce_call_timer(call, ack_lost_at, nowj, - rxrpc_timer_set_for_lost_ack); - } - } - - if (txb->seq == 1 && - !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, - &call->flags)) { - unsigned long nowj = jiffies, expect_rx_by; - - expect_rx_by = nowj + call->next_rx_timo; - WRITE_ONCE(call->expect_rx_by, expect_rx_by); - rxrpc_reduce_call_timer(call, expect_rx_by, nowj, - rxrpc_timer_set_for_normal); - } - - rxrpc_set_keepalive(call); + rxrpc_tstamp_data_packets(call, txb); } else { /* Cancel the call if the initial transmission fails, * particularly if that's due to network routing issues that @@ -482,41 +518,6 @@ done: _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; - -send_fragmentable: - /* attempt to send this message with fragmentation enabled */ - _debug("send fragment"); - - txb->last_sent = ktime_get_real(); - if (txb->wire.flags & RXRPC_REQUEST_ACK) - rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); - - switch (conn->local->srx.transport.family) { - case AF_INET6: - case AF_INET: - rxrpc_local_dont_fragment(conn->local, false); - rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); - ret = do_udp_sendmsg(conn->local->socket, &msg, len); - conn->peer->last_tx_at = ktime_get_seconds(); - - rxrpc_local_dont_fragment(conn->local, true); - break; - - default: - BUG(); - } - - if (ret < 0) { - rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); - rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - trace_rxrpc_tx_fail(call->debug_id, serial, ret, - rxrpc_tx_point_call_data_frag); - } else { - trace_rxrpc_tx_packet(call->debug_id, &txb->wire, - rxrpc_tx_point_call_data_frag); - } - rxrpc_tx_backoff(call, ret); - goto done; } /* @@ -723,11 +724,9 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) rxrpc_instant_resend(call, txb); } } else { - unsigned long now = jiffies; - unsigned long resend_at = now + call->peer->rto_j; + ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - WRITE_ONCE(call->resend_at, resend_at); - rxrpc_reduce_call_timer(call, resend_at, now, - rxrpc_timer_set_for_send); + call->resend_at = ktime_add(ktime_get_real(), delay); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx); } } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 26dc2f26d9..263a2251e3 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,9 +52,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; - unsigned long timeout = 0; rxrpc_seq_t acks_hard_ack; char lbuff[50], rbuff[50]; + long timeout = 0; if (v == &rxnet->calls) { seq_puts(seq, @@ -76,10 +76,8 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pISpc", &call->dest_srx.transport); state = rxrpc_call_state(call); - if (state != RXRPC_CALL_SERVER_PREALLOC) { - timeout = READ_ONCE(call->expect_rx_by); - timeout -= jiffies; - } + if (state != RXRPC_CALL_SERVER_PREALLOC) + timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real()); acks_hard_ack = READ_ONCE(call->acks_hard_ack); seq_printf(seq, @@ -309,7 +307,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) peer->mtu, now - peer->last_tx_at, peer->srtt_us >> 3, - jiffies_to_usecs(peer->rto_j)); + peer->rto_us); return 0; } diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index e8ee4af43c..4fe6b4d20a 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -135,9 +135,9 @@ struct rxrpc_ackpacket { /* * ACK packets can have a further piece of information tagged on the end */ -struct rxrpc_ackinfo { - __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ - __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ +struct rxrpc_acktrailer { + __be32 maxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ + __be32 ifMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ }; diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index be61d6f5be..cdab7b7d08 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -11,8 +11,8 @@ #include <linux/net.h> #include "ar-internal.h" -#define RXRPC_RTO_MAX ((unsigned)(120 * HZ)) -#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ +#define RXRPC_RTO_MAX (120 * USEC_PER_SEC) +#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) @@ -22,7 +22,7 @@ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { - return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); + return (peer->srtt_us >> 3) + peer->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) @@ -124,7 +124,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - peer->rto_j = rxrpc_bound_rto(rto); + peer->rto_us = rxrpc_bound_rto(rto); } static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) @@ -163,33 +163,33 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, spin_unlock(&peer->rtt_input_lock); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - peer->srtt_us >> 3, peer->rto_j); + peer->srtt_us >> 3, peer->rto_us); } /* - * Get the retransmission timeout to set in jiffies, backing it off each time - * we retransmit. + * Get the retransmission timeout to set in nanoseconds, backing it off each + * time we retransmit. */ -unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) { - u64 timo_j; - u8 backoff = READ_ONCE(peer->backoff); + u64 timo_us; + u32 backoff = READ_ONCE(peer->backoff); - timo_j = peer->rto_j; - timo_j <<= backoff; - if (retrans && timo_j * 2 <= RXRPC_RTO_MAX) + timo_us = peer->rto_us; + timo_us <<= backoff; + if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(peer->backoff, backoff + 1); - if (timo_j < 1) - timo_j = 1; + if (timo_us < 1) + timo_us = 1; - return timo_j; + return ns_to_ktime(timo_us * NSEC_PER_USEC); } void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) { - peer->rto_j = RXRPC_TIMEOUT_INIT; - peer->mdev_us = jiffies_to_usecs(RXRPC_TIMEOUT_INIT); + peer->rto_us = RXRPC_TIMEOUT_INIT; + peer->mdev_us = RXRPC_TIMEOUT_INIT; peer->backoff = 0; //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6b32d61d4c..48a1475e6b 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -145,16 +145,17 @@ error: /* * Work out how much data we can put in a packet. */ -static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, - size_t *_buf_size, size_t *_data_size, size_t *_offset) +static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - size_t shdr, buf_size, chunk; + struct rxrpc_txbuf *txb; + size_t shdr, space; + + remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - shdr = 0; - goto out; + space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, space, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -163,17 +164,16 @@ static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, break; } - buf_size = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN); + space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); + space = round_up(space, RXKAD_ALIGN); - chunk = buf_size - shdr; - if (remain < chunk) - buf_size = round_up(shdr + remain, RXKAD_ALIGN); + txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); + if (!txb) + return NULL; -out: - *_buf_size = buf_size; - *_data_size = chunk; - *_offset = shdr; - return 0; + txb->offset += shdr; + txb->space -= shdr; + return txb; } /* @@ -251,7 +251,8 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxkad_level1_hdr *hdr = (void *)txb->data; + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxkad_level1_hdr *hdr = (void *)(whdr + 1); struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -259,7 +260,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = txb->seq ^ ntohl(txb->wire.callNumber); + check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); txb->len += sizeof(struct rxkad_level1_hdr); @@ -267,14 +268,14 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->data + txb->offset, 0, pad); + memset(txb->kvec[0].iov_base + txb->offset, 0, pad); txb->len += pad; } /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg, txb->data, 8); + sg_init_one(&sg, hdr, 8); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); @@ -293,7 +294,8 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr *rxkhdr = (void *)txb->data; + struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -302,7 +304,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, _enter(""); - check = txb->seq ^ ntohl(txb->wire.callNumber); + check = txb->seq ^ call->call_id; rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; @@ -312,7 +314,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->data + txb->offset, 0, pad); + memset(txb->kvec[0].iov_base + txb->offset, 0, pad); txb->len += pad; } @@ -320,7 +322,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, txb->data, txb->len); + sg_init_one(&sg, rxkhdr, txb->len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); @@ -362,9 +364,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* calculate the security checksum */ - x = (ntohl(txb->wire.cid) & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); + x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= txb->seq & 0x3fffffff; - crypto.buf[0] = txb->wire.callNumber; + crypto.buf[0] = htonl(call->call_id); crypto.buf[1] = htonl(x); sg_init_one(&sg, crypto.buf, 8); @@ -378,7 +380,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ - txb->wire.cksum = htons(y); + txb->cksum = htons(y); switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: @@ -726,7 +728,6 @@ static int rxkad_send_response(struct rxrpc_connection *conn, rxrpc_local_dont_fragment(conn->local, false); ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - rxrpc_local_dont_fragment(conn->local, true); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_rxkad_response); @@ -1256,7 +1257,7 @@ const struct rxrpc_security rxkad = { .free_preparse_server_key = rxkad_free_preparse_server_key, .destroy_server_key = rxkad_destroy_server_key, .init_connection_security = rxkad_init_connection_security, - .how_much_data = rxkad_how_much_data, + .alloc_txbuf = rxkad_alloc_txbuf, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 5677d5690a..894b8fa68e 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -240,7 +240,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { rxrpc_seq_t seq = txb->seq; - bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke; + bool poke, last = txb->flags & RXRPC_LAST_PACKET; rxrpc_inc_stat(call->rxnet, stat_tx_data); @@ -336,7 +336,7 @@ reload: do { if (!txb) { - size_t remain, bufsize, chunk, offset; + size_t remain; _debug("alloc"); @@ -348,23 +348,11 @@ reload: * region (enc blocksize), but the trailer is not. */ remain = more ? INT_MAX : msg_data_left(msg); - ret = call->conn->security->how_much_data(call, remain, - &bufsize, &chunk, &offset); - if (ret < 0) + txb = call->conn->security->alloc_txbuf(call, remain, sk->sk_allocation); + if (!txb) { + ret = -ENOMEM; goto maybe_error; - - _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset); - - /* create a buffer that we can retain until it's ACK'd */ - ret = -ENOMEM; - txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_DATA, - GFP_KERNEL); - if (!txb) - goto maybe_error; - - txb->offset = offset; - txb->space -= offset; - txb->space = min_t(size_t, chunk, txb->space); + } } _debug("append"); @@ -374,8 +362,8 @@ reload: size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->data + txb->offset, copy, - &msg->msg_iter)) + if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, + copy, &msg->msg_iter)) goto efault; _debug("added"); txb->space -= copy; @@ -394,18 +382,18 @@ reload: /* add the packet to the send queue if it's now full */ if (!txb->space || (msg_data_left(msg) == 0 && !more)) { - if (msg_data_left(msg) == 0 && !more) { - txb->wire.flags |= RXRPC_LAST_PACKET; - __set_bit(RXRPC_TXBUF_LAST, &txb->flags); - } + if (msg_data_left(msg) == 0 && !more) + txb->flags |= RXRPC_LAST_PACKET; else if (call->tx_top - call->acks_hard_ack < call->tx_winsize) - txb->wire.flags |= RXRPC_MORE_PACKETS; + txb->flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; + txb->kvec[0].iov_len += txb->len; + txb->len = txb->kvec[0].iov_len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } @@ -621,7 +609,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; - unsigned long now, j; bool dropped_lock = false; int ret; @@ -699,25 +686,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) switch (p.call.nr_timeouts) { case 3: - j = msecs_to_jiffies(p.call.timeouts.normal); - if (p.call.timeouts.normal > 0 && j == 0) - j = 1; - WRITE_ONCE(call->next_rx_timo, j); + WRITE_ONCE(call->next_rx_timo, p.call.timeouts.normal); fallthrough; case 2: - j = msecs_to_jiffies(p.call.timeouts.idle); - if (p.call.timeouts.idle > 0 && j == 0) - j = 1; - WRITE_ONCE(call->next_req_timo, j); + WRITE_ONCE(call->next_req_timo, p.call.timeouts.idle); fallthrough; case 1: if (p.call.timeouts.hard > 0) { - j = p.call.timeouts.hard * HZ; - now = jiffies; - j += now; - WRITE_ONCE(call->expect_term_by, j); - rxrpc_reduce_call_timer(call, j, now, - rxrpc_timer_set_for_hard); + ktime_t delay = ms_to_ktime(p.call.timeouts.hard * MSEC_PER_SEC); + + WRITE_ONCE(call->expect_term_by, + ktime_add(p.call.timeouts.hard, + ktime_get_real())); + trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); + rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); + } break; } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index ecaeb4ecfb..9bf9a1f6e4 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -15,6 +15,8 @@ static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; static const unsigned int n_max_acks = 255; +static const unsigned long one_ms = 1; +static const unsigned long max_ms = 1000; static const unsigned long one_jiffy = 1; static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY @@ -28,24 +30,24 @@ static const unsigned long max_500 = 500; * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { - /* Values measured in milliseconds but used in jiffies */ + /* Values measured in milliseconds */ { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_ms_jiffies_minmax, - .extra1 = (void *)&one_jiffy, - .extra2 = (void *)&max_jiffies, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (void *)&one_ms, + .extra2 = (void *)&max_ms, }, { .procname = "idle_ack_delay", .data = &rxrpc_idle_ack_delay, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_ms_jiffies_minmax, - .extra1 = (void *)&one_jiffy, - .extra2 = (void *)&max_jiffies, + .proc_handler = proc_doulongvec_minmax, + .extra1 = (void *)&one_ms, + .extra2 = (void *)&max_ms, }, { .procname = "idle_conn_expiry", @@ -125,7 +127,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, - { } }; int __init rxrpc_sysctl_init(void) diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index d43be85123..c3913d8a50 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -14,45 +14,146 @@ static atomic_t rxrpc_txbuf_debug_ids; atomic_t rxrpc_nr_txbuf; /* - * Allocate and partially initialise an I/O request structure. + * Allocate and partially initialise a data transmission buffer. */ -struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, - gfp_t gfp) +struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, + size_t data_align, gfp_t gfp) { + struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; + size_t total, hoff; + void *buf; txb = kmalloc(sizeof(*txb), gfp); - if (txb) { - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); - refcount_set(&txb->ref, 1); - txb->call_debug_id = call->debug_id; - txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); - txb->space = sizeof(txb->data); - txb->len = 0; - txb->offset = 0; - txb->flags = 0; - txb->ack_why = 0; - txb->seq = call->tx_prepared + 1; - txb->wire.epoch = htonl(call->conn->proto.epoch); - txb->wire.cid = htonl(call->cid); - txb->wire.callNumber = htonl(call->call_id); - txb->wire.seq = htonl(txb->seq); - txb->wire.type = packet_type; - txb->wire.flags = call->conn->out_clientflag; - txb->wire.userStatus = 0; - txb->wire.securityIndex = call->security_ix; - txb->wire._rsvd = 0; - txb->wire.serviceId = htons(call->dest_srx.srx_service); - - trace_rxrpc_txbuf(txb->debug_id, - txb->call_debug_id, txb->seq, 1, - packet_type == RXRPC_PACKET_TYPE_DATA ? - rxrpc_txbuf_alloc_data : - rxrpc_txbuf_alloc_ack); - atomic_inc(&rxrpc_nr_txbuf); + if (!txb) + return NULL; + + hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); + total = hoff + sizeof(*whdr) + data_size; + + data_align = umax(data_align, L1_CACHE_BYTES); + mutex_lock(&call->conn->tx_data_alloc_lock); + buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, + data_align); + mutex_unlock(&call->conn->tx_data_alloc_lock); + if (!buf) { + kfree(txb); + return NULL; + } + + whdr = buf + hoff; + + INIT_LIST_HEAD(&txb->call_link); + INIT_LIST_HEAD(&txb->tx_link); + refcount_set(&txb->ref, 1); + txb->last_sent = KTIME_MIN; + txb->call_debug_id = call->debug_id; + txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->space = data_size; + txb->len = 0; + txb->offset = sizeof(*whdr); + txb->flags = call->conn->out_clientflag; + txb->ack_why = 0; + txb->seq = call->tx_prepared + 1; + txb->serial = 0; + txb->cksum = 0; + txb->nr_kvec = 1; + txb->kvec[0].iov_base = whdr; + txb->kvec[0].iov_len = sizeof(*whdr); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(txb->seq); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + + trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, + rxrpc_txbuf_alloc_data); + + atomic_inc(&rxrpc_nr_txbuf); + return txb; +} + +/* + * Allocate and partially initialise an ACK packet. + */ +struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_acktrailer *trailer; + struct rxrpc_ackpacket *ack; + struct rxrpc_txbuf *txb; + gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; + void *buf, *buf2 = NULL; + u8 *filler; + + txb = kmalloc(sizeof(*txb), gfp); + if (!txb) + return NULL; + + buf = page_frag_alloc(&call->local->tx_alloc, + sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); + if (!buf) { + kfree(txb); + return NULL; + } + + if (sack_size) { + buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); + if (!buf2) { + page_frag_free(buf); + kfree(txb); + return NULL; + } } + whdr = buf; + ack = buf + sizeof(*whdr); + filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; + trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; + + INIT_LIST_HEAD(&txb->call_link); + INIT_LIST_HEAD(&txb->tx_link); + refcount_set(&txb->ref, 1); + txb->call_debug_id = call->debug_id; + txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->space = 0; + txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer); + txb->offset = 0; + txb->flags = call->conn->out_clientflag; + txb->ack_rwind = 0; + txb->seq = 0; + txb->serial = 0; + txb->cksum = 0; + txb->nr_kvec = 3; + txb->kvec[0].iov_base = whdr; + txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack); + txb->kvec[1].iov_base = buf2; + txb->kvec[1].iov_len = sack_size; + txb->kvec[2].iov_base = filler; + txb->kvec[2].iov_len = 3 + sizeof(*trailer); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_ACK; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + + get_page(virt_to_head_page(trailer)); + + trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, + rxrpc_txbuf_alloc_ack); + atomic_inc(&rxrpc_nr_txbuf); return txb; } @@ -71,12 +172,15 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r, what); } -static void rxrpc_free_txbuf(struct rcu_head *rcu) +static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - struct rxrpc_txbuf *txb = container_of(rcu, struct rxrpc_txbuf, rcu); + int i; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); + for (i = 0; i < txb->nr_kvec; i++) + if (txb->kvec[i].iov_base) + page_frag_free(txb->kvec[i].iov_base); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } @@ -95,7 +199,7 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) dead = __refcount_dec_and_test(&txb->ref, &r); trace_rxrpc_txbuf(debug_id, call_debug_id, seq, r - 1, what); if (dead) - call_rcu(&txb->rcu, rxrpc_free_txbuf); + rxrpc_free_txbuf(txb); } } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 470c70deff..8180d0c12f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -737,16 +737,6 @@ config NET_ACT_SAMPLE To compile this code as a module, choose M here: the module will be called act_sample. -config NET_ACT_IPT - tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && NETFILTER_XTABLES - help - Say Y here to be able to invoke iptables targets after successful - classification. - - To compile this code as a module, choose M here: the - module will be called act_ipt. - config NET_ACT_NAT tristate "Stateless NAT" depends on NET_CLS_ACT diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3e30d72604..2520708b06 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -830,7 +830,6 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, u32 max; if (*index) { -again: rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); @@ -839,7 +838,7 @@ again: * index but did not assign the pointer yet. */ rcu_read_unlock(); - goto again; + return -EAGAIN; } if (!p) { @@ -1363,7 +1362,7 @@ struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, if (rtnl_held) rtnl_unlock(); - request_module("act_%s", act_name); + request_module(NET_ACT_ALIAS_PREFIX "%s", act_name); if (rtnl_held) rtnl_lock(); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6cfee66581..0e3cf11ae5 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -401,6 +401,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .size = sizeof(struct tcf_bpf), }; +MODULE_ALIAS_NET_ACT("bpf"); static __net_init int bpf_init_net(struct net *net) { diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index f876275665..0fce631e7c 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -242,6 +242,7 @@ static struct tc_action_ops act_connmark_ops = { .cleanup = tcf_connmark_cleanup, .size = sizeof(struct tcf_connmark_info), }; +MODULE_ALIAS_NET_ACT("connmark"); static __net_init int connmark_init_net(struct net *net) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 7f8b1f2f2e..5cc8e407e7 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -709,6 +709,7 @@ static struct tc_action_ops act_csum_ops = { .offload_act_setup = tcf_csum_offload_act_setup, .size = sizeof(struct tcf_csum), }; +MODULE_ALIAS_NET_ACT("csum"); static __net_init int csum_init_net(struct net *net) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6124d8b128..9d451d77d5 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -41,21 +41,28 @@ static struct workqueue_struct *act_ct_wq; static struct rhashtable zones_ht; static DEFINE_MUTEX(zones_mutex); +struct zones_ht_key { + struct net *net; + u16 zone; + /* Note : pad[] must be the last field. */ + u8 pad[]; +}; + struct tcf_ct_flow_table { struct rhash_head node; /* In zones tables */ struct rcu_work rwork; struct nf_flowtable nf_ft; refcount_t ref; - u16 zone; + struct zones_ht_key key; bool dying; }; static const struct rhashtable_params zones_params = { .head_offset = offsetof(struct tcf_ct_flow_table, node), - .key_offset = offsetof(struct tcf_ct_flow_table, zone), - .key_len = sizeof_field(struct tcf_ct_flow_table, zone), + .key_offset = offsetof(struct tcf_ct_flow_table, key), + .key_len = offsetof(struct zones_ht_key, pad), .automatic_shrinking = true, }; @@ -316,11 +323,12 @@ static struct nf_flowtable_type flowtable_ct = { static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) { + struct zones_ht_key key = { .net = net, .zone = params->zone }; struct tcf_ct_flow_table *ct_ft; int err = -ENOMEM; mutex_lock(&zones_mutex); - ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); + ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params); if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) goto out_unlock; @@ -329,7 +337,7 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) goto err_alloc; refcount_set(&ct_ft->ref, 1); - ct_ft->zone = params->zone; + ct_ft->key = key; err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); if (err) goto err_insert; @@ -1071,6 +1079,14 @@ do_nat: */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) goto drop; + + /* The ct may be dropped if a clash has been resolved, + * so it's necessary to retrieve it from skb again to + * prevent UAF. + */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + skip_add = true; } if (!skip_add) @@ -1600,6 +1616,7 @@ static struct tc_action_ops act_ct_ops = { .offload_act_setup = tcf_ct_offload_act_setup, .size = sizeof(struct tcf_ct), }; +MODULE_ALIAS_NET_ACT("ct"); static __net_init int ct_init_net(struct net *net) { diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index e620f9a84a..5dd41a0121 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -363,6 +363,7 @@ static struct tc_action_ops act_ctinfo_ops = { .cleanup= tcf_ctinfo_cleanup, .size = sizeof(struct tcf_ctinfo), }; +MODULE_ALIAS_NET_ACT("ctinfo"); static __net_init int ctinfo_init_net(struct net *net) { diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4af3b7ec24..e949280eb8 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -296,6 +296,7 @@ static struct tc_action_ops act_gact_ops = { .offload_act_setup = tcf_gact_offload_act_setup, .size = sizeof(struct tcf_gact), }; +MODULE_ALIAS_NET_ACT("gact"); static __net_init int gact_init_net(struct net *net) { diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index c681cd011a..1dd7412539 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -645,6 +645,7 @@ static struct tc_action_ops act_gate_ops = { .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; +MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 0e867d13be..107c6d83dc 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -889,6 +889,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .size = sizeof(struct tcf_ife_info), }; +MODULE_ALIAS_NET_ACT("ife"); static __net_init int ife_init_net(struct net *net) { diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6faa7d00da..5b38143659 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -635,6 +635,7 @@ static struct tc_action_ops act_mirred_ops = { .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, }; +MODULE_ALIAS_NET_ACT("mirred"); static __net_init int mirred_init_net(struct net *net) { diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 34b8edb6cc..44a37a71ae 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -452,6 +452,7 @@ static struct tc_action_ops act_mpls_ops = { .offload_act_setup = tcf_mpls_offload_act_setup, .size = sizeof(struct tcf_mpls), }; +MODULE_ALIAS_NET_ACT("mpls"); static __net_init int mpls_init_net(struct net *net) { diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index a180e72463..d541f55380 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -324,6 +324,7 @@ static struct tc_action_ops act_nat_ops = { .cleanup = tcf_nat_cleanup, .size = sizeof(struct tcf_nat), }; +MODULE_ALIAS_NET_ACT("nat"); static __net_init int nat_init_net(struct net *net) { diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 2ef22969f2..fc0a35a7b6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -515,11 +515,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } + opt->nkeys = parms->tcfp_nkeys; memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; - opt->nkeys = parms->tcfp_nkeys; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; @@ -620,6 +620,7 @@ static struct tc_action_ops act_pedit_ops = { .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; +MODULE_ALIAS_NET_ACT("pedit"); static __net_init int pedit_init_net(struct net *net) { diff --git a/net/sched/act_police.c b/net/sched/act_police.c index e119b4a3db..8555125ed3 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -502,6 +502,7 @@ static struct tc_action_ops act_police_ops = { .offload_act_setup = tcf_police_offload_act_setup, .size = sizeof(struct tcf_police), }; +MODULE_ALIAS_NET_ACT("police"); static __net_init int police_init_net(struct net *net) { diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index c5c61efe6d..a69b53d540 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -316,6 +316,7 @@ static struct tc_action_ops act_sample_ops = { .offload_act_setup = tcf_sample_offload_act_setup, .size = sizeof(struct tcf_sample), }; +MODULE_ALIAS_NET_ACT("sample"); static __net_init int sample_init_net(struct net *net) { diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 0a3e928882..f3abe05459 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -209,6 +209,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .size = sizeof(struct tcf_defact), }; +MODULE_ALIAS_NET_ACT("simple"); static __net_init int simp_init_net(struct net *net) { diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 754f78b35b..1f1d9ce3e9 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -426,6 +426,7 @@ static struct tc_action_ops act_skbedit_ops = { .offload_act_setup = tcf_skbedit_offload_act_setup, .size = sizeof(struct tcf_skbedit), }; +MODULE_ALIAS_NET_ACT("skbedit"); static __net_init int skbedit_init_net(struct net *net) { diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 0015393910..cd0accaf84 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -287,6 +287,7 @@ static struct tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .size = sizeof(struct tcf_skbmod), }; +MODULE_ALIAS_NET_ACT("skbmod"); static __net_init int skbmod_init_net(struct net *net) { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 300b08aa82..af7c998459 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -230,7 +230,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) { + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } @@ -247,7 +247,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, dst_len -= opt_len; dst += opt_len; } - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: if (type) { @@ -259,7 +259,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: if (type) { @@ -271,7 +271,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; } } @@ -302,7 +302,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, switch (nla_type(nla_data(nla))) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -310,7 +310,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #endif case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -318,7 +318,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #endif case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -363,6 +363,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *metadata = NULL; struct tcf_chain *goto_ch = NULL; struct tc_tunnel_key *parm; @@ -371,7 +372,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, __be16 dst_port = 0; __be64 key_id = 0; int opts_len = 0; - __be16 flags = 0; u8 tos, ttl; int ret = 0; u32 index; @@ -412,16 +412,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); key_id = key32_to_tunnel_id(key32); - flags = TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, flags); } - flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, flags); if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) - flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, flags); if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG])) - flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, flags); if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); @@ -663,15 +663,15 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, if (!start) return -EMSGSIZE; - if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; - } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_vxlan_opts_dump(skb, info); if (err) goto err_out; - } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_erspan_opts_dump(skb, info); if (err) goto err_out; @@ -741,7 +741,7 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); - if (((key->tun_flags & TUNNEL_KEY) && + if ((test_bit(IP_TUNNEL_KEY_BIT, key->tun_flags) && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || @@ -749,8 +749,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM)) || - ((key->tun_flags & TUNNEL_DONT_FRAGMENT) && + !test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) || + (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) && nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; @@ -842,6 +842,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .offload_act_setup = tcf_tunnel_key_offload_act_setup, .size = sizeof(struct tcf_tunnel_key), }; +MODULE_ALIAS_NET_ACT("tunnel_key"); static __net_init int tunnel_key_init_net(struct net *net) { diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 836183011a..22f4b1e8ad 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -427,6 +427,7 @@ static struct tc_action_ops act_vlan_ops = { .offload_act_setup = tcf_vlan_offload_act_setup, .size = sizeof(struct tcf_vlan), }; +MODULE_ALIAS_NET_ACT("vlan"); static __net_init int vlan_init_net(struct net *net) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ff3d396a65..17d97bbe89 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -257,7 +257,7 @@ tcf_proto_lookup_ops(const char *kind, bool rtnl_held, #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); - request_module("cls_%s", kind); + request_module(NET_CLS_ALIAS_PREFIX "%s", kind); if (rtnl_held) rtnl_lock(); ops = __tcf_proto_lookup_ops(kind); @@ -410,12 +410,48 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } +static void tcf_maintain_bypass(struct tcf_block *block) +{ + int filtercnt = atomic_read(&block->filtercnt); + int skipswcnt = atomic_read(&block->skipswcnt); + bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; + + if (bypass_wanted != block->bypass_wanted) { +#ifdef CONFIG_NET_CLS_ACT + if (bypass_wanted) + static_branch_inc(&tcf_bypass_check_needed_key); + else + static_branch_dec(&tcf_bypass_check_needed_key); +#endif + block->bypass_wanted = bypass_wanted; + } +} + +static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) +{ + lockdep_assert_not_held(&block->cb_lock); + + down_write(&block->cb_lock); + if (*counted != add) { + if (add) { + atomic_inc(&block->filtercnt); + *counted = true; + } else { + atomic_dec(&block->filtercnt); + *counted = false; + } + } + tcf_maintain_bypass(block); + up_write(&block->cb_lock); +} + static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -2367,6 +2403,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); + tcf_block_filter_cnt_update(block, &tp->counted, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; @@ -3483,6 +3520,8 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(*flags)) + atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } @@ -3491,6 +3530,8 @@ static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(*flags)) + atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index a1f5693133..ecfaa4f9a0 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -328,6 +328,7 @@ static struct tcf_proto_ops cls_basic_ops __read_mostly = { .bind_class = basic_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("basic"); static int __init init_basic(void) { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 382c7a71f8..5e83e890f6 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -693,6 +693,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; +MODULE_ALIAS_NET_CLS("bpf"); static int __init cls_bpf_init_mod(void) { diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 7ee8dbf49e..424252982d 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -209,6 +209,7 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { .dump = cls_cgroup_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("cgroup"); static int __init init_cgroup_cls(void) { diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 6ab317b48d..5502998aac 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -702,6 +702,7 @@ static struct tcf_proto_ops cls_flow_ops __read_mostly = { .walk = flow_walk, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("flow"); static int __init cls_flow_init(void) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6ee7064c82..fd9a6f20b6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,6 +28,7 @@ #include <net/vxlan.h> #include <net/erspan.h> #include <net/gtp.h> +#include <net/pfcp.h> #include <net/tc_wrapper.h> #include <net/dst.h> @@ -741,6 +742,7 @@ enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_GTP] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_PFCP] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -771,6 +773,12 @@ gtp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1] = { }; static const struct nla_policy +pfcp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID] = { .type = NLA_U64 }, +}; + +static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, @@ -1419,6 +1427,44 @@ static int fl_set_gtp_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(*sinfo); } +static int fl_set_pfcp_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1]; + struct pfcp_metadata *md; + int err; + + md = (struct pfcp_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_PFCP) { + NL_SET_ERR_MSG_MOD(extack, "Non-pfcp option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX, nla, + pfcp_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing tunnel key pfcp option type"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) + md->type = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]); + + if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]) + md->seid = nla_get_be64(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]); + + return sizeof(*md); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1454,12 +1500,13 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, switch (nla_type(nla_opt_key)) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: if (key->enc_opts.dst_opt_type && - key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) { + key->enc_opts.dst_opt_type != + IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT; option_len = fl_set_geneve_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1470,7 +1517,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT; option_len = fl_set_geneve_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1489,7 +1536,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT; option_len = fl_set_vxlan_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1500,7 +1547,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT; option_len = fl_set_vxlan_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1519,7 +1566,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT; option_len = fl_set_erspan_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1530,7 +1577,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT; option_len = fl_set_erspan_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1550,7 +1597,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT; option_len = fl_set_gtp_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1561,7 +1608,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT; option_len = fl_set_gtp_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1575,6 +1622,36 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } break; + case TCA_FLOWER_KEY_ENC_OPTS_PFCP: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG_MOD(extack, "Duplicate type for pfcp options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT; + option_len = fl_set_pfcp_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT; + option_len = fl_set_pfcp_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG_MOD(extack, "Key and mask miss aligned"); + return -EINVAL; + } + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -3117,6 +3194,32 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_pfcp_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct pfcp_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_PFCP); + if (!nest) + goto nla_put_failure; + + md = (struct pfcp_metadata *)&enc_opts->data[0]; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, md->type)) + goto nla_put_failure; + + if (nla_put_be64(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID, + md->seid, 0)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -3202,26 +3305,31 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, goto nla_put_failure; switch (enc_opts->dst_opt_type) { - case TUNNEL_GENEVE_OPT: + case IP_TUNNEL_GENEVE_OPT_BIT: err = fl_dump_key_geneve_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_VXLAN_OPT: + case IP_TUNNEL_VXLAN_OPT_BIT: err = fl_dump_key_vxlan_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_ERSPAN_OPT: + case IP_TUNNEL_ERSPAN_OPT_BIT: err = fl_dump_key_erspan_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_GTP_OPT: + case IP_TUNNEL_GTP_OPT_BIT: err = fl_dump_key_gtp_opt(skb, enc_opts); if (err) goto nla_put_failure; break; + case IP_TUNNEL_PFCP_OPT_BIT: + err = fl_dump_key_pfcp_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } @@ -3659,6 +3767,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .owner = THIS_MODULE, .flags = TCF_PROTO_OPS_DOIT_UNLOCKED, }; +MODULE_ALIAS_NET_CLS("flower"); static int __init cls_fl_init(void) { diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index afc534ee0a..cdddc86952 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -433,6 +433,7 @@ static struct tcf_proto_ops cls_fw_ops __read_mostly = { .bind_class = fw_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("fw"); static int __init init_fw(void) { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index c4ed11df62..9f1e62ca50 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -398,6 +398,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = { .bind_class = mall_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("matchall"); static int __init cls_mall_init(void) { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 12a505db41..b9c58c040c 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -671,6 +671,7 @@ static struct tcf_proto_ops cls_route4_ops __read_mostly = { .bind_class = route4_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("route"); static int __init init_route4(void) { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 289e1755c2..9412d88a99 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1453,6 +1453,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .bind_class = u32_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("u32"); static int __init init_u32(void) { diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 31e38a614f..74afc21052 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -228,7 +228,7 @@ int qdisc_set_default(const char *name) if (!ops) { /* Not found, drop lock and try to load module */ write_unlock(&qdisc_mod_lock); - request_module("sch_%s", name); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); @@ -1275,7 +1275,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, * go away in the mean time. */ rtnl_unlock(); - request_module("sch_%s", name); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); rtnl_lock(); ops = qdisc_lookup_ops(kind); if (ops != NULL) { @@ -1334,7 +1334,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, * before again attaching a qdisc. */ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { - dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } @@ -1389,6 +1389,7 @@ err_out4: ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: + lockdep_unregister_key(&sch->root_lock_key); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: @@ -2410,7 +2411,7 @@ static struct pernet_operations psched_net_ops = { .exit = psched_net_exit, }; -#if IS_ENABLED(CONFIG_RETPOLINE) +#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); #endif diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 9cff995586..9602dafe32 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1512,7 +1512,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) if (!q->overflow_timeout) { int i; /* Build fresh max-heap */ - for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) cake_heapify(q, i); } q->overflow_timeout = 65535; @@ -2572,6 +2572,8 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; + u16 rate_flags; + u8 flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, @@ -2579,10 +2581,11 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) - q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; - q->flow_mode |= CAKE_FLOW_NAT_FLAG * + flow_mode &= ~CAKE_FLOW_NAT_FLAG; + flow_mode |= CAKE_FLOW_NAT_FLAG * !!nla_get_u32(tb[TCA_CAKE_NAT]); #else NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], @@ -2592,29 +2595,34 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_BASE_RATE64]) - q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + WRITE_ONCE(q->rate_bps, + nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); if (tb[TCA_CAKE_DIFFSERV_MODE]) - q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + WRITE_ONCE(q->tin_mode, + nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); + rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) - q->rate_flags |= CAKE_FLAG_WASH; + rate_flags |= CAKE_FLAG_WASH; else - q->rate_flags &= ~CAKE_FLAG_WASH; + rate_flags &= ~CAKE_FLAG_WASH; } if (tb[TCA_CAKE_FLOW_MODE]) - q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); if (tb[TCA_CAKE_ATM]) - q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + WRITE_ONCE(q->atm_mode, + nla_get_u32(tb[TCA_CAKE_ATM])); if (tb[TCA_CAKE_OVERHEAD]) { - q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); - q->rate_flags |= CAKE_FLAG_OVERHEAD; + WRITE_ONCE(q->rate_overhead, + nla_get_s32(tb[TCA_CAKE_OVERHEAD])); + rate_flags |= CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2623,7 +2631,7 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_RAW]) { - q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + rate_flags &= ~CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2632,54 +2640,58 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_MPU]) - q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + WRITE_ONCE(q->rate_mpu, + nla_get_u32(tb[TCA_CAKE_MPU])); if (tb[TCA_CAKE_RTT]) { - q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); - if (!q->interval) - q->interval = 1; + WRITE_ONCE(q->interval, max(interval, 1U)); } if (tb[TCA_CAKE_TARGET]) { - q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); + u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); - if (!q->target) - q->target = 1; + WRITE_ONCE(q->target, max(target, 1U)); } if (tb[TCA_CAKE_AUTORATE]) { if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) - q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; } if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) - q->rate_flags |= CAKE_FLAG_INGRESS; + rate_flags |= CAKE_FLAG_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_INGRESS; + rate_flags &= ~CAKE_FLAG_INGRESS; } if (tb[TCA_CAKE_ACK_FILTER]) - q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + WRITE_ONCE(q->ack_filter, + nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); if (tb[TCA_CAKE_MEMORY]) - q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + WRITE_ONCE(q->buffer_config_limit, + nla_get_u32(tb[TCA_CAKE_MEMORY])); if (tb[TCA_CAKE_SPLIT_GSO]) { if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) - q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + rate_flags |= CAKE_FLAG_SPLIT_GSO; else - q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } if (tb[TCA_CAKE_FWMARK]) { - q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); - q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; + WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); + WRITE_ONCE(q->fwmark_shft, + q->fwmark_mask ? __ffs(q->fwmark_mask) : 0); } + WRITE_ONCE(q->rate_flags, rate_flags); + WRITE_ONCE(q->flow_mode, flow_mode); if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2774,68 +2786,72 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; + u16 rate_flags; + u8 flow_mode; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; - if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, - TCA_CAKE_PAD)) + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, + READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, - q->flow_mode & CAKE_FLOW_MASK)) + flow_mode = READ_ONCE(q->flow_mode); + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + if (nla_put_u32(skb, TCA_CAKE_MEMORY, + READ_ONCE(q->buffer_config_limit))) goto nla_put_failure; + rate_flags = READ_ONCE(q->rate_flags); if (nla_put_u32(skb, TCA_CAKE_AUTORATE, - !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_INGRESS, - !!(q->rate_flags & CAKE_FLAG_INGRESS))) + !!(rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_NAT, - !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + !!(flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_WASH, - !!(q->rate_flags & CAKE_FLAG_WASH))) + !!(rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) goto nla_put_failure; - if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (!(rate_flags & CAKE_FLAG_OVERHEAD)) if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, - !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) + if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -3103,6 +3119,7 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = { .dump_stats = cake_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("cake"); static int __init cake_module_init(void) { diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index beece8e82c..939425da18 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -389,11 +389,11 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, } /* Everything went OK, save the parameters used. */ - q->hicredit = qopt->hicredit; - q->locredit = qopt->locredit; - q->idleslope = qopt->idleslope * BYTES_PER_KBIT; - q->sendslope = qopt->sendslope * BYTES_PER_KBIT; - q->offload = qopt->offload; + WRITE_ONCE(q->hicredit, qopt->hicredit); + WRITE_ONCE(q->locredit, qopt->locredit); + WRITE_ONCE(q->idleslope, qopt->idleslope * BYTES_PER_KBIT); + WRITE_ONCE(q->sendslope, qopt->sendslope * BYTES_PER_KBIT); + WRITE_ONCE(q->offload, qopt->offload); return 0; } @@ -459,11 +459,11 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) if (!nest) goto nla_put_failure; - opt.hicredit = q->hicredit; - opt.locredit = q->locredit; - opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); - opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); - opt.offload = q->offload; + opt.hicredit = READ_ONCE(q->hicredit); + opt.locredit = READ_ONCE(q->locredit); + opt.sendslope = div64_s64(READ_ONCE(q->sendslope), BYTES_PER_KBIT); + opt.idleslope = div64_s64(READ_ONCE(q->idleslope), BYTES_PER_KBIT); + opt.offload = READ_ONCE(q->offload); if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -546,6 +546,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .dump = cbs_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("cbs"); static struct notifier_block cbs_device_notifier = { .notifier_call = cbs_dev_notifier, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ae1da08e26..9107201092 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -405,8 +405,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, } else sch_tree_lock(sch); - q->flags = ctl->flags; - q->limit = ctl->limit; + WRITE_ONCE(q->flags, ctl->flags); + WRITE_ONCE(q->limit, ctl->limit); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, @@ -431,15 +431,16 @@ static int choke_init(struct Qdisc *sch, struct nlattr *opt, static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) { struct choke_sched_data *q = qdisc_priv(sch); + u8 Wlog = READ_ONCE(q->parms.Wlog); struct nlattr *opts = NULL; struct tc_red_qopt opt = { - .limit = q->limit, - .flags = q->flags, - .qth_min = q->parms.qth_min >> q->parms.Wlog, - .qth_max = q->parms.qth_max >> q->parms.Wlog, - .Wlog = q->parms.Wlog, - .Plog = q->parms.Plog, - .Scell_log = q->parms.Scell_log, + .limit = READ_ONCE(q->limit), + .flags = READ_ONCE(q->flags), + .qth_min = READ_ONCE(q->parms.qth_min) >> Wlog, + .qth_max = READ_ONCE(q->parms.qth_max) >> Wlog, + .Wlog = Wlog, + .Plog = READ_ONCE(q->parms.Plog), + .Scell_log = READ_ONCE(q->parms.Scell_log), }; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -447,7 +448,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || - nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -498,6 +499,7 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = { .dump_stats = choke_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("choke"); static int __init choke_module_init(void) { diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index d7a4874543..3e8d4fe4d9 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Codel - The Controlled-Delay Active Queue Management algorithm * @@ -7,37 +8,6 @@ * Implemented on linux by : * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The names of the authors may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, provided that this notice is retained in full, this - * software may be distributed under the terms of the GNU General - * Public License ("GPL") version 2, in which case the provisions of the - * GPL apply INSTEAD OF those given above. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * */ #include <linux/module.h> @@ -148,26 +118,31 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CODEL_TARGET]) { u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); - q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.target, + ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); - q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.ce_threshold, + (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); - q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.interval, + ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_CODEL_LIMIT])); if (tb[TCA_CODEL_ECN]) - q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + WRITE_ONCE(q->params.ecn, + !!nla_get_u32(tb[TCA_CODEL_ECN])); qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { @@ -213,6 +188,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct codel_sched_data *q = qdisc_priv(sch); + codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -220,17 +196,18 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_CODEL_TARGET, - codel_time_to_us(q->params.target)) || + codel_time_to_us(READ_ONCE(q->params.target))) || nla_put_u32(skb, TCA_CODEL_LIMIT, - sch->limit) || + READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_CODEL_INTERVAL, - codel_time_to_us(q->params.interval)) || + codel_time_to_us(READ_ONCE(q->params.interval))) || nla_put_u32(skb, TCA_CODEL_ECN, - q->params.ecn)) + READ_ONCE(q->params.ecn))) goto nla_put_failure; - if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + ce_threshold = READ_ONCE(q->params.ce_threshold); + if (ce_threshold != CODEL_DISABLED_THRESHOLD && nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, - codel_time_to_us(q->params.ce_threshold))) + codel_time_to_us(ce_threshold))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -287,6 +264,7 @@ static struct Qdisc_ops codel_qdisc_ops __read_mostly = { .dump_stats = codel_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("codel"); static int __init codel_module_init(void) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 097740a9af..c69b999fae 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -481,6 +481,7 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .destroy = drr_destroy_qdisc, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("drr"); static int __init drr_init(void) { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 4808159a54..c74d778c32 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -467,15 +467,15 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (!nest) goto nla_put_failure; - opt.delta = q->delta; - opt.clockid = q->clockid; - if (q->offload) + opt.delta = READ_ONCE(q->delta); + opt.clockid = READ_ONCE(q->clockid); + if (READ_ONCE(q->offload)) opt.flags |= TC_ETF_OFFLOAD_ON; - if (q->deadline_mode) + if (READ_ONCE(q->deadline_mode)) opt.flags |= TC_ETF_DEADLINE_MODE_ON; - if (q->skip_sock_check) + if (READ_ONCE(q->skip_sock_check)) opt.flags |= TC_ETF_SKIP_SOCK_CHECK; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) @@ -500,6 +500,7 @@ static struct Qdisc_ops etf_qdisc_ops __read_mostly = { .dump = etf_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("etf"); static int __init etf_module_init(void) { diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index f7c8849594..f80bc05d4c 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -646,7 +646,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); - q->nbands = nbands; + WRITE_ONCE(q->nbands, nbands); for (i = nstrict; i < q->nstrict; i++) { if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); @@ -658,11 +658,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, list_del(&q->classes[i].alist); qdisc_tree_flush_backlog(q->classes[i].qdisc); } - q->nstrict = nstrict; + WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); for (i = 0; i < q->nbands; i++) - q->classes[i].quantum = quanta[i]; + WRITE_ONCE(q->classes[i].quantum, quanta[i]); for (i = oldbands; i < q->nbands; i++) { q->classes[i].qdisc = queues[i]; @@ -676,7 +676,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); q->classes[i].qdisc = NULL; - q->classes[i].quantum = 0; + WRITE_ONCE(q->classes[i].quantum, 0); q->classes[i].deficit = 0; gnet_stats_basic_sync_init(&q->classes[i].bstats); memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); @@ -733,6 +733,7 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) struct ets_sched *q = qdisc_priv(sch); struct nlattr *opts; struct nlattr *nest; + u8 nbands, nstrict; int band; int prio; int err; @@ -745,21 +746,22 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) if (!opts) goto nla_err; - if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + nbands = READ_ONCE(q->nbands); + if (nla_put_u8(skb, TCA_ETS_NBANDS, nbands)) goto nla_err; - if (q->nstrict && - nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + nstrict = READ_ONCE(q->nstrict); + if (nstrict && nla_put_u8(skb, TCA_ETS_NSTRICT, nstrict)) goto nla_err; - if (q->nbands > q->nstrict) { + if (nbands > nstrict) { nest = nla_nest_start(skb, TCA_ETS_QUANTA); if (!nest) goto nla_err; - for (band = q->nstrict; band < q->nbands; band++) { + for (band = nstrict; band < nbands; band++) { if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, - q->classes[band].quantum)) + READ_ONCE(q->classes[band].quantum))) goto nla_err; } @@ -771,7 +773,8 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_err; for (prio = 0; prio <= TC_PRIO_MAX; prio++) { - if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, + READ_ONCE(q->prio2band[prio]))) goto nla_err; } @@ -812,6 +815,7 @@ static struct Qdisc_ops ets_qdisc_ops __read_mostly = { .dump = ets_qdisc_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("ets"); static int __init ets_init(void) { diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 450f5c67ac..b50b2c2cc0 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,7 +19,8 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -28,7 +29,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(sch->q.qlen < sch->limit)) + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -39,7 +40,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; - if (likely(sch->q.qlen < sch->limit)) + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; @@ -105,14 +106,14 @@ static int __fifo_init(struct Qdisc *sch, struct nlattr *opt, if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); - sch->limit = limit; + WRITE_ONCE(sch->limit, limit); } else { struct tc_fifo_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; - sch->limit = ctl->limit; + WRITE_ONCE(sch->limit, ctl->limit); } if (is_bfifo) @@ -154,7 +155,7 @@ static void fifo_destroy(struct Qdisc *sch) static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct tc_fifo_qopt opt = { .limit = sch->limit }; + struct tc_fifo_qopt opt = { .limit = READ_ONCE(sch->limit) }; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 3a31c47fea..2389747256 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -106,6 +106,8 @@ struct fq_perband_flows { int quantum; /* based on band nr : 576KB, 192KB, 64KB */ }; +#define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2) + struct fq_sched_data { /* Read mostly cache line */ @@ -122,7 +124,7 @@ struct fq_sched_data { u8 rate_enable; u8 fq_trees_log; u8 horizon_drop; - u8 prio2band[(TC_PRIO_MAX + 1) >> 2]; + u8 prio2band[FQ_PRIO2BAND_CRUMB_SIZE]; u32 timer_slack; /* hrtimer slack in ns */ /* Read/Write fields. */ @@ -159,7 +161,7 @@ struct fq_sched_data { /* return the i-th 2-bit value ("crumb") */ static u8 fq_prio2band(const u8 *prio2band, unsigned int prio) { - return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3; + return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3; } /* @@ -888,7 +890,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); q->fq_root = array; - q->fq_trees_log = log; + WRITE_ONCE(q->fq_trees_log, log); sch_tree_unlock(sch); @@ -927,11 +929,15 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { static void fq_prio2band_compress_crumb(const u8 *in, u8 *out) { const int num_elems = TC_PRIO_MAX + 1; + u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE]; int i; - memset(out, 0, num_elems / 4); + memset(tmp, 0, sizeof(tmp)); for (i = 0; i < num_elems; i++) - out[i / 4] |= in[i] << (2 * (i & 0x3)); + tmp[i / 4] |= in[i] << (2 * (i & 0x3)); + + for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++) + WRITE_ONCE(out[i], tmp[i]); } static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out) @@ -958,7 +964,7 @@ static int fq_load_weights(struct fq_sched_data *q, } } for (i = 0; i < FQ_BANDS; i++) - q->band_flows[i].quantum = weights[i]; + WRITE_ONCE(q->band_flows[i].quantum, weights[i]); return 0; } @@ -1011,16 +1017,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = -EINVAL; } if (tb[TCA_FQ_PLIMIT]) - sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_FQ_PLIMIT])); if (tb[TCA_FQ_FLOW_PLIMIT]) - q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + WRITE_ONCE(q->flow_plimit, + nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT])); if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); if (quantum > 0 && quantum <= (1 << 20)) { - q->quantum = quantum; + WRITE_ONCE(q->quantum, quantum); } else { NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); err = -EINVAL; @@ -1028,7 +1036,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_FQ_INITIAL_QUANTUM]) - q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + WRITE_ONCE(q->initial_quantum, + nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM])); if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", @@ -1037,17 +1046,19 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_FLOW_MAX_RATE]) { u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); - q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; + WRITE_ONCE(q->flow_max_rate, + (rate == ~0U) ? ~0UL : rate); } if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) - q->low_rate_threshold = - nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + WRITE_ONCE(q->low_rate_threshold, + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD])); if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); if (enable <= 1) - q->rate_enable = enable; + WRITE_ONCE(q->rate_enable, + enable); else err = -EINVAL; } @@ -1055,7 +1066,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; - q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + WRITE_ONCE(q->flow_refill_delay, + usecs_to_jiffies(usecs_delay)); } if (!err && tb[TCA_FQ_PRIOMAP]) @@ -1065,21 +1077,26 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack); if (tb[TCA_FQ_ORPHAN_MASK]) - q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); + WRITE_ONCE(q->orphan_mask, + nla_get_u32(tb[TCA_FQ_ORPHAN_MASK])); if (tb[TCA_FQ_CE_THRESHOLD]) - q->ce_threshold = (u64)NSEC_PER_USEC * - nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + WRITE_ONCE(q->ce_threshold, + (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD])); if (tb[TCA_FQ_TIMER_SLACK]) - q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + WRITE_ONCE(q->timer_slack, + nla_get_u32(tb[TCA_FQ_TIMER_SLACK])); if (tb[TCA_FQ_HORIZON]) - q->horizon = (u64)NSEC_PER_USEC * - nla_get_u32(tb[TCA_FQ_HORIZON]); + WRITE_ONCE(q->horizon, + (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON])); if (tb[TCA_FQ_HORIZON_DROP]) - q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + WRITE_ONCE(q->horizon_drop, + nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); if (!err) { @@ -1160,13 +1177,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); - u64 ce_threshold = q->ce_threshold; struct tc_prio_qopt prio = { .bands = FQ_BANDS, }; - u64 horizon = q->horizon; struct nlattr *opts; + u64 ce_threshold; s32 weights[3]; + u64 horizon; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) @@ -1174,35 +1191,48 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + ce_threshold = READ_ONCE(q->ce_threshold); do_div(ce_threshold, NSEC_PER_USEC); + + horizon = READ_ONCE(q->horizon); do_div(horizon, NSEC_PER_USEC); - if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || - nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || - nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || - nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + if (nla_put_u32(skb, TCA_FQ_PLIMIT, + READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, + READ_ONCE(q->flow_plimit)) || + nla_put_u32(skb, TCA_FQ_QUANTUM, + READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, + READ_ONCE(q->initial_quantum)) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, + READ_ONCE(q->rate_enable)) || nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, - min_t(unsigned long, q->flow_max_rate, ~0U)) || + min_t(unsigned long, + READ_ONCE(q->flow_max_rate), ~0U)) || nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, - jiffies_to_usecs(q->flow_refill_delay)) || - nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) || + nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, + READ_ONCE(q->orphan_mask)) || nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, - q->low_rate_threshold) || + READ_ONCE(q->low_rate_threshold)) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, + READ_ONCE(q->fq_trees_log)) || + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, + READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || - nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, + READ_ONCE(q->horizon_drop))) goto nla_put_failure; fq_prio2band_decompress_crumb(q->prio2band, prio.priomap); if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio)) goto nla_put_failure; - weights[0] = q->band_flows[0].quantum; - weights[1] = q->band_flows[1].quantum; - weights[2] = q->band_flows[2].quantum; + weights[0] = READ_ONCE(q->band_flows[0].quantum); + weights[1] = READ_ONCE(q->band_flows[1].quantum); + weights[2] = READ_ONCE(q->band_flows[2].quantum); if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights)) goto nla_put_failure; @@ -1264,6 +1294,7 @@ static struct Qdisc_ops fq_qdisc_ops __read_mostly = { .dump_stats = fq_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("fq"); static int __init fq_module_init(void) { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 8c4fee0634..4f908c11ba 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -396,40 +396,49 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_TARGET]) { u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); - q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.target, + (target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); - q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.ce_threshold, + (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]) - q->cparams.ce_threshold_selector = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]); + WRITE_ONCE(q->cparams.ce_threshold_selector, + nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR])); if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]) - q->cparams.ce_threshold_mask = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]); + WRITE_ONCE(q->cparams.ce_threshold_mask, + nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK])); if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); - q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.interval, + (interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_FQ_CODEL_LIMIT])); if (tb[TCA_FQ_CODEL_ECN]) - q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + WRITE_ONCE(q->cparams.ecn, + !!nla_get_u32(tb[TCA_FQ_CODEL_ECN])); if (quantum) - q->quantum = quantum; + WRITE_ONCE(q->quantum, quantum); if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) - q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); + WRITE_ONCE(q->drop_batch_size, + max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]))); if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) - q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); + WRITE_ONCE(q->memory_limit, + min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]))); while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { @@ -522,6 +531,7 @@ init_failure: static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_codel_sched_data *q = qdisc_priv(sch); + codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -529,30 +539,33 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, - codel_time_to_us(q->cparams.target)) || + codel_time_to_us(READ_ONCE(q->cparams.target))) || nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, - sch->limit) || + READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, - codel_time_to_us(q->cparams.interval)) || + codel_time_to_us(READ_ONCE(q->cparams.interval))) || nla_put_u32(skb, TCA_FQ_CODEL_ECN, - q->cparams.ecn) || + READ_ONCE(q->cparams.ecn)) || nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, - q->quantum) || + READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE, - q->drop_batch_size) || + READ_ONCE(q->drop_batch_size)) || nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT, - q->memory_limit) || + READ_ONCE(q->memory_limit)) || nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, - q->flows_cnt)) + READ_ONCE(q->flows_cnt))) goto nla_put_failure; - if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD) { + ce_threshold = READ_ONCE(q->cparams.ce_threshold); + if (ce_threshold != CODEL_DISABLED_THRESHOLD) { if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, - codel_time_to_us(q->cparams.ce_threshold))) + codel_time_to_us(ce_threshold))) goto nla_put_failure; - if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, q->cparams.ce_threshold_selector)) + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, + READ_ONCE(q->cparams.ce_threshold_selector))) goto nla_put_failure; - if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, q->cparams.ce_threshold_mask)) + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, + READ_ONCE(q->cparams.ce_threshold_mask))) goto nla_put_failure; } @@ -717,6 +730,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .dump_stats = fq_codel_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("fq_codel"); static int __init fq_codel_module_init(void) { diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 5b595773e5..c38f33ff80 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -10,6 +10,7 @@ */ #include <linux/jhash.h> +#include <linux/module.h> #include <linux/sizes.h> #include <linux/vmalloc.h> #include <net/pkt_cls.h> @@ -298,8 +299,8 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); - q->p_params.limit = limit; - sch->limit = limit; + WRITE_ONCE(q->p_params.limit, limit); + WRITE_ONCE(sch->limit, limit); } if (tb[TCA_FQ_PIE_FLOWS]) { if (q->flows) { @@ -321,39 +322,45 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); /* convert to pschedtime */ - q->p_params.target = - PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + WRITE_ONCE(q->p_params.target, + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_FQ_PIE_TUPDATE]) - q->p_params.tupdate = - usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + WRITE_ONCE(q->p_params.tupdate, + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]))); if (tb[TCA_FQ_PIE_ALPHA]) - q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + WRITE_ONCE(q->p_params.alpha, + nla_get_u32(tb[TCA_FQ_PIE_ALPHA])); if (tb[TCA_FQ_PIE_BETA]) - q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + WRITE_ONCE(q->p_params.beta, + nla_get_u32(tb[TCA_FQ_PIE_BETA])); if (tb[TCA_FQ_PIE_QUANTUM]) - q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + WRITE_ONCE(q->quantum, nla_get_u32(tb[TCA_FQ_PIE_QUANTUM])); if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) - q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT])); if (tb[TCA_FQ_PIE_ECN_PROB]) - q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + WRITE_ONCE(q->ecn_prob, + nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB])); if (tb[TCA_FQ_PIE_ECN]) - q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + WRITE_ONCE(q->p_params.ecn, + nla_get_u32(tb[TCA_FQ_PIE_ECN])); if (tb[TCA_FQ_PIE_BYTEMODE]) - q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + WRITE_ONCE(q->p_params.bytemode, + nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE])); if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) - q->p_params.dq_rate_estimator = - nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + WRITE_ONCE(q->p_params.dq_rate_estimator, + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { @@ -470,22 +477,23 @@ static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) return -EMSGSIZE; /* convert target from pschedtime to us */ - if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, READ_ONCE(q->flows_cnt)) || nla_put_u32(skb, TCA_FQ_PIE_TARGET, - ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + ((u32)PSCHED_TICKS2NS(READ_ONCE(q->p_params.target))) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, - jiffies_to_usecs(q->p_params.tupdate)) || - nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || - nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || - nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || - nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || - nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || - nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + jiffies_to_usecs(READ_ONCE(q->p_params.tupdate))) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, READ_ONCE(q->p_params.alpha)) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, READ_ONCE(q->p_params.beta)) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, READ_ONCE(q->ecn_prob)) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, READ_ONCE(q->p_params.ecn)) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, READ_ONCE(q->p_params.bytemode)) || nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, - q->p_params.dq_rate_estimator)) + READ_ONCE(q->p_params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -563,6 +571,7 @@ static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = { .dump_stats = fq_pie_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("fq_pie"); static int __init fq_pie_module_init(void) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a498b5d7c5..e22ff003d5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -27,6 +27,7 @@ #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> +#include <net/hotdata.h> #include <trace/events/qdisc.h> #include <trace/events/net.h> #include <net/xfrm.h> @@ -409,7 +410,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = READ_ONCE(dev_tx_weight); + int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { @@ -505,19 +506,22 @@ static void dev_watchdog(struct timer_list *t) unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; + unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); trans_start = READ_ONCE(txq->trans_start); - if (netif_xmit_stopped(txq) && - time_after(jiffies, (trans_start + - dev->watchdog_timeo))) { + if (!netif_xmit_stopped(txq)) + continue; + if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } + if (time_after(oldest_start, trans_start)) + oldest_start = trans_start; } if (unlikely(timedout_ms)) { @@ -530,7 +534,7 @@ static void dev_watchdog(struct timer_list *t) netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + + round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } @@ -672,6 +676,7 @@ struct Qdisc noop_qdisc = { .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, + .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); @@ -944,7 +949,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); + lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); + lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = @@ -979,6 +986,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, return sch; errout1: + lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); @@ -1067,6 +1075,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) if (ops->destroy) ops->destroy(qdisc); + lockdep_unregister_key(&qdisc->root_lock_key); module_put(ops->owner); netdev_put(dev, &qdisc->dev_tracker); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8c61eb3dc9..79ba9dc702 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -930,6 +930,7 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .dump = gred_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("gred"); static int __init gred_module_init(void) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 16c45da403..c287bf8423 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1174,7 +1174,8 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } /* classification failed, try default class */ - cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), + READ_ONCE(q->defcls)), sch); if (cl == NULL || cl->level > 0) return NULL; @@ -1443,9 +1444,7 @@ hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; qopt = nla_data(opt); - sch_tree_lock(sch); - q->defcls = qopt->defcls; - sch_tree_unlock(sch); + WRITE_ONCE(q->defcls, qopt->defcls); return 0; } @@ -1525,7 +1524,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; - qopt.defcls = q->defcls; + qopt.defcls = READ_ONCE(q->defcls); if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; return skb->len; @@ -1679,6 +1678,7 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .priv_size = sizeof(struct hfsc_sched), .owner = THIS_MODULE }; +MODULE_ALIAS_NET_SCH("hfsc"); static int __init hfsc_init(void) diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d26cd436cb..44d9efe1a9 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -534,27 +534,31 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); - q->quantum = new_quantum; - q->hhf_non_hh_weight = new_hhf_non_hh_weight; + WRITE_ONCE(q->quantum, new_quantum); + WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) - q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + WRITE_ONCE(q->hh_flows_limit, + nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); - q->hhf_reset_timeout = usecs_to_jiffies(us); + WRITE_ONCE(q->hhf_reset_timeout, + usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) - q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + WRITE_ONCE(q->hhf_admit_bytes, + nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); - q->hhf_evict_timeout = usecs_to_jiffies(us); + WRITE_ONCE(q->hhf_evict_timeout, + usecs_to_jiffies(us)); } qlen = sch->q.qlen; @@ -657,15 +661,18 @@ static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, + READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, - jiffies_to_usecs(q->hhf_reset_timeout)) || - nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, + READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, - jiffies_to_usecs(q->hhf_evict_timeout)) || - nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, + READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -702,6 +709,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7349233eaa..ff3de37874 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1039,13 +1039,6 @@ static void htb_work_func(struct work_struct *work) rcu_read_unlock(); } -static void htb_set_lockdep_class_child(struct Qdisc *q) -{ - static struct lock_class_key child_key; - - lockdep_set_class(qdisc_lock(q), &child_key); -} - static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt) { return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt); @@ -1132,7 +1125,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, return -ENOMEM; } - htb_set_lockdep_class_child(qdisc); q->direct_qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } @@ -1468,7 +1460,6 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, } if (q->offload) { - htb_set_lockdep_class_child(new); /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ qdisc_refcount_inc(new); old_q = htb_graft_helper(dev_queue, new); @@ -1733,11 +1724,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); - if (q->offload) { - if (new_q) - htb_set_lockdep_class_child(new_q); + if (q->offload) htb_parent_to_leaf_offload(sch, dev_queue, new_q); - } } sch_tree_lock(sch); @@ -1947,13 +1935,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, classid, NULL); if (q->offload) { - if (new_q) { - htb_set_lockdep_class_child(new_q); - /* One ref for cl->leaf.q, the other for - * dev_queue->qdisc. - */ + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ + if (new_q) qdisc_refcount_inc(new_q); - } old_q = htb_graft_helper(dev_queue, new_q); /* No qdisc_put needed. */ WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); @@ -2166,6 +2150,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .dump = htb_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("htb"); static int __init htb_module_init(void) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5fa9eaa79b..cc6051d4f2 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -91,7 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -121,7 +121,7 @@ static void ingress_destroy(struct Qdisc *sch) tcf_block_put_ext(q->block, sch, &q->block_info); if (entry) { - tcx_miniq_set_active(entry, false); + tcx_miniq_dec(entry); if (!tcx_entry_is_active(entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(entry); @@ -168,6 +168,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .ingress_block_get = ingress_ingress_block_get, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("ingress"); struct clsact_sched_data { struct tcf_block *ingress_block; @@ -256,7 +257,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -275,7 +276,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, false, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, false); @@ -301,7 +302,7 @@ static void clsact_destroy(struct Qdisc *sch) tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); if (ingress_entry) { - tcx_miniq_set_active(ingress_entry, false); + tcx_miniq_dec(ingress_entry); if (!tcx_entry_is_active(ingress_entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(ingress_entry); @@ -309,7 +310,7 @@ static void clsact_destroy(struct Qdisc *sch) } if (egress_entry) { - tcx_miniq_set_active(egress_entry, false); + tcx_miniq_dec(egress_entry); if (!tcx_entry_is_active(egress_entry)) { tcx_entry_update(dev, NULL, false); tcx_entry_free(egress_entry); @@ -344,6 +345,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .egress_block_get = clsact_egress_block_get, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("clsact"); static int __init ingress_module_init(void) { @@ -368,6 +370,5 @@ static void __exit ingress_module_exit(void) module_init(ingress_module_init); module_exit(ingress_module_exit); -MODULE_ALIAS("sch_clsact"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Ingress and clsact based ingress and egress qdiscs"); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 43e53ee00a..51d4013b61 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -215,10 +215,8 @@ static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt, for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) fp[tc] = priv->fp[tc]; - nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) { - if (nla_type(n) != TCA_MQPRIO_TC_ENTRY) - continue; - + nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt, + nlattr_opt_len, rem) { err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack); if (err) goto out; @@ -774,6 +772,7 @@ static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .dump = mqprio_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("mqprio"); static int __init mqprio_module_init(void) { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index d66d5f0ec0..06e03f5cd7 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -185,7 +185,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qopt->bands = qdisc_dev(sch)->real_num_tx_queues; - removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands), + removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands), GFP_KERNEL); if (!removed) return -ENOMEM; @@ -395,6 +395,7 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .dump = multiq_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("multiq"); static int __init multiq_module_init(void) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index fa678eb885..edc72962ae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1293,6 +1293,7 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .dump = netem_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("netem"); static int __init netem_module_init(void) diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 2da6250ec3..b3dcb845b3 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -156,36 +156,38 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); /* convert to pschedtime */ - q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + WRITE_ONCE(q->params.target, + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) - q->params.tupdate = - usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + WRITE_ONCE(q->params.tupdate, + usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]))); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); - q->params.limit = limit; - sch->limit = limit; + WRITE_ONCE(q->params.limit, limit); + WRITE_ONCE(sch->limit, limit); } if (tb[TCA_PIE_ALPHA]) - q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + WRITE_ONCE(q->params.alpha, nla_get_u32(tb[TCA_PIE_ALPHA])); if (tb[TCA_PIE_BETA]) - q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + WRITE_ONCE(q->params.beta, nla_get_u32(tb[TCA_PIE_BETA])); if (tb[TCA_PIE_ECN]) - q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + WRITE_ONCE(q->params.ecn, nla_get_u32(tb[TCA_PIE_ECN])); if (tb[TCA_PIE_BYTEMODE]) - q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + WRITE_ONCE(q->params.bytemode, + nla_get_u32(tb[TCA_PIE_BYTEMODE])); if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) - q->params.dq_rate_estimator = - nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); + WRITE_ONCE(q->params.dq_rate_estimator, + nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; @@ -469,17 +471,18 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, - ((u32)PSCHED_TICKS2NS(q->params.target)) / + ((u32)PSCHED_TICKS2NS(READ_ONCE(q->params.target))) / NSEC_PER_USEC) || - nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_PIE_TUPDATE, - jiffies_to_usecs(q->params.tupdate)) || - nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || - nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + jiffies_to_usecs(READ_ONCE(q->params.tupdate))) || + nla_put_u32(skb, TCA_PIE_ALPHA, READ_ONCE(q->params.alpha)) || + nla_put_u32(skb, TCA_PIE_BETA, READ_ONCE(q->params.beta)) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || - nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, + READ_ONCE(q->params.bytemode)) || nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, - q->params.dq_rate_estimator)) + READ_ONCE(q->params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -556,6 +559,7 @@ static struct Qdisc_ops pie_qdisc_ops __read_mostly = { .dump_stats = pie_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("pie"); static int __init pie_module_init(void) { diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 992f0c8d79..cefb65201e 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -213,6 +213,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("plug"); static int __init plug_module_init(void) { diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 8ecdd3ef6f..cc30f7a32f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -418,6 +418,7 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .dump = prio_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("prio"); static int __init prio_module_init(void) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 48a604c320..d584c0c258 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1521,6 +1521,7 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .destroy = qfq_destroy_qdisc, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("qfq"); static int __init qfq_init(void) { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 607b6c8b3a..b5f096588f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -548,6 +548,7 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = { .dump_stats = red_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("red"); static int __init red_module_init(void) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1871a1c022..b717e15a3a 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -709,6 +709,7 @@ static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { .dump_stats = sfb_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("sfb"); static int __init sfb_module_init(void) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index eb77558fa3..3b9245a3c7 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -608,6 +608,7 @@ static void sfq_perturbation(struct timer_list *t) struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; + int period; get_random_bytes(&nkey, sizeof(nkey)); rcu_read_lock(); @@ -618,8 +619,12 @@ static void sfq_perturbation(struct timer_list *t) sfq_rehash(sch); spin_unlock(root_lock); - if (q->perturb_period) - mod_timer(&q->perturb_timer, jiffies + q->perturb_period); + /* q->perturb_period can change under us from + * sfq_change() and sfq_destroy(). + */ + period = READ_ONCE(q->perturb_period); + if (period) + mod_timer(&q->perturb_timer, jiffies + period); rcu_read_unlock(); } @@ -662,7 +667,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) q->quantum = ctl->quantum; q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); } - q->perturb_period = ctl->perturb_period * HZ; + WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); if (ctl->flows) q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { @@ -724,7 +729,7 @@ static void sfq_destroy(struct Qdisc *sch) struct sfq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - q->perturb_period = 0; + WRITE_ONCE(q->perturb_period, 0); del_timer_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); @@ -925,6 +930,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .dump = sfq_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("sfq"); static int __init sfq_module_init(void) { diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 28beb11762..20ff7386b7 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -79,7 +79,9 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, prio = min(skb->priority, max_priority); qdisc = &q->qdiscs[prio]; - if (sch->q.qlen < sch->limit) { + + /* sch->limit can change under us from skbprio_change() */ + if (sch->q.qlen < READ_ONCE(sch->limit)) { __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); @@ -172,7 +174,7 @@ static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, if (opt->nla_len != nla_attr_size(sizeof(*ctl))) return -EINVAL; - sch->limit = ctl->limit; + WRITE_ONCE(sch->limit, ctl->limit); return 0; } @@ -200,7 +202,7 @@ static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_skbprio_qopt opt; - opt.limit = sch->limit; + opt.limit = READ_ONCE(sch->limit); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) return -1; @@ -292,6 +294,7 @@ static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { .destroy = skbprio_destroy, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("skbprio"); static int __init skbprio_module_init(void) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index ad99409c63..b284a06b5a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -40,6 +40,8 @@ static struct static_key_false taprio_have_working_mqprio; #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) +#define TAPRIO_SUPPORTED_FLAGS \ + (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { @@ -408,19 +410,6 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) return entry; } -static bool taprio_flags_valid(u32 flags) -{ - /* Make sure no other flag bits are set. */ - if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | - TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) - return false; - /* txtime-assist and full offload are mutually exclusive */ - if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && - (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) - return false; - return true; -} - /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { @@ -1032,7 +1021,8 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, - [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, + [TCA_TAPRIO_ATTR_FLAGS] = + NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; @@ -1161,11 +1151,6 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - if (!cycle) { - NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); - return -EINVAL; - } - if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; @@ -1174,6 +1159,11 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { + NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); + return -EINVAL; + } + taprio_calculate_gate_durations(q, new); return 0; @@ -1186,16 +1176,13 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); - if (!qopt && !dev->num_tc) { - NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); - return -EINVAL; - } - - /* If num_tc is already set, it means that the user already - * configured the mqprio part - */ - if (dev->num_tc) + if (!qopt) { + if (!dev->num_tc) { + NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); + return -EINVAL; + } return 0; + } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { @@ -1762,10 +1749,7 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, fp[tc] = q->fp[tc]; } - nla_for_each_nested(n, opt, rem) { - if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) - continue; - + nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) @@ -1816,33 +1800,6 @@ static int taprio_mqprio_cmp(const struct net_device *dev, return 0; } -/* The semantics of the 'flags' argument in relation to 'change()' - * requests, are interpreted following two rules (which are applied in - * this order): (1) an omitted 'flags' argument is interpreted as - * zero; (2) the 'flags' of a "running" taprio instance cannot be - * changed. - */ -static int taprio_new_flags(const struct nlattr *attr, u32 old, - struct netlink_ext_ack *extack) -{ - u32 new = 0; - - if (attr) - new = nla_get_u32(attr); - - if (old != TAPRIO_FLAGS_INVALID && old != new) { - NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); - return -EOPNOTSUPP; - } - - if (!taprio_flags_valid(new)) { - NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); - return -EINVAL; - } - - return new; -} - static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1853,6 +1810,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; + u32 taprio_flags; ktime_t start; int i, err; @@ -1864,12 +1822,31 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], - q->flags, extack); - if (err < 0) - return err; + /* The semantics of the 'flags' argument in relation to 'change()' + * requests, are interpreted following two rules (which are applied in + * this order): (1) an omitted 'flags' argument is interpreted as + * zero; (2) the 'flags' of a "running" taprio instance cannot be + * changed. + */ + taprio_flags = tb[TCA_TAPRIO_ATTR_FLAGS] ? nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]) : 0; - q->flags = err; + /* txtime-assist and full offload are mutually exclusive */ + if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && + (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], + "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); + return -EINVAL; + } + + if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { + NL_SET_ERR_MSG_MOD(extack, + "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } + q->flags = taprio_flags; + + /* Needed for length_to_duration() during netlink attribute parsing */ + taprio_set_picos_per_byte(dev, q); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) @@ -1930,7 +1907,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) goto free_sched; - taprio_set_picos_per_byte(dev, q); taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) @@ -2549,6 +2525,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index dd6b1a723b..f1d09183ae 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -608,6 +608,7 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .dump = tbf_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("tbf"); static int __init tbf_module_init(void) { diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 59304611dc..8badec6d82 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -78,7 +78,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); - if (q->q.qlen < dev->tx_queue_len) { + if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) { __skb_queue_tail(&q->q, skb); return NET_XMIT_SUCCESS; } @@ -424,7 +424,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) } while ((q = NEXT_SLAVE(q)) != m->slaves); } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } diff --git a/net/sctp/diag.c b/net/sctp/diag.c index eb05131ff1..23359e5222 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -507,6 +507,7 @@ done: } static const struct inet_diag_handler sctp_diag_handler = { + .owner = THIS_MODULE, .dump = sctp_diag_dump, .dump_one = sctp_diag_dump_one, .idiag_get_info = sctp_diag_get_info, diff --git a/net/sctp/input.c b/net/sctp/input.c index 17fcaa9b0d..a8a254a500 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -735,15 +735,19 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; + int err = 0; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; + write_lock(&head->lock); if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; - int cnt = 0, err = 1; + int cnt = 0; + + err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; @@ -761,24 +765,24 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) - return err; + goto out; break; } else if (err < 0) { - return err; + goto out; } } if (err) { err = reuseport_alloc(sk, any); if (err) - return err; + goto out; } } - write_lock(&head->lock); hlist_add_head(&ep->node, &head->chain); +out: write_unlock(&head->lock); - return 0; + return err; } /* Add an endpoint to the hash. Local BH-safe. */ @@ -803,10 +807,9 @@ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) head = &sctp_ep_hashtable[ep->hashent]; + write_lock(&head->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - - write_lock(&head->lock); hlist_del_init(&ep->node); write_unlock(&head->lock); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 24368f755a..f7b809c0d1 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -415,7 +415,7 @@ out: if (!IS_ERR_OR_NULL(dst)) { struct rt6_info *rt; - rt = (struct rt6_info *)dst; + rt = dst_rt6_info(dst); t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 94c6dd53cd..5a7436a13b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -552,7 +552,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; - struct rtable *rt = (struct rtable *)t->dst; + struct rtable *rt = dst_rtable(t->dst); if (rt) { saddr->v4.sin_family = AF_INET; @@ -1085,7 +1085,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); - udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, + udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false); return 0; @@ -1495,17 +1495,11 @@ static __init int sctp_init(void) /* Allocate bind_bucket and chunk caches. */ status = -ENOBUFS; - sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", - sizeof(struct sctp_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL); + sctp_bucket_cachep = KMEM_CACHE(sctp_bind_bucket, SLAB_HWCACHE_ALIGN); if (!sctp_bucket_cachep) goto out; - sctp_chunk_cachep = kmem_cache_create("sctp_chunk", - sizeof(struct sctp_chunk), - 0, SLAB_HWCACHE_ALIGN, - NULL); + sctp_chunk_cachep = KMEM_CACHE(sctp_chunk, SLAB_HWCACHE_ALIGN); if (!sctp_chunk_cachep) goto err_chunk_cachep; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 08fdf1251f..5adf0c0a6c 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -38,6 +38,7 @@ #include <linux/inet.h> #include <linux/slab.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <net/inet_ecn.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b9fcdb095..c009383369 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -67,6 +67,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> +#include <net/rps.h> /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); @@ -4846,7 +4847,7 @@ static int sctp_disconnect(struct sock *sk, int flags) * descriptor will be returned from accept() to represent the newly * formed association. */ -static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) +static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg) { struct sctp_sock *sp; struct sctp_endpoint *ep; @@ -4870,7 +4871,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) goto out; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); error = sctp_wait_for_accept(sk, timeo); if (error) @@ -4881,7 +4882,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) */ asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); - newsk = sp->pf->create_accept_sk(sk, asoc, kern); + newsk = sp->pf->create_accept_sk(sk, asoc, arg->kern); if (!newsk) { error = -ENOMEM; goto out; @@ -4898,7 +4899,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) out: release_sock(sk); - *err = error; + arg->err = error; return newsk; } @@ -7118,6 +7119,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_assoc_ids *ids; + size_t ids_size; u32 num = 0; if (sctp_style(sk, TCP)) @@ -7130,11 +7132,11 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, num++; } - if (len < sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num) + ids_size = struct_size(ids, gaids_assoc_id, num); + if (len < ids_size) return -EINVAL; - len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; - + len = ids_size; ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; @@ -9275,7 +9277,7 @@ void sctp_data_ready(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index f65d6f92af..61c6f3027e 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -80,8 +80,6 @@ static struct ctl_table sctp_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - - { /* sentinel */ } }; /* The following index defines are used in sctp_sysctl_net_register(). @@ -384,8 +382,6 @@ static struct ctl_table sctp_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &pf_expose_max, }, - - { /* sentinel */ } }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, @@ -597,6 +593,7 @@ static int proc_sctp_do_probe_interval(struct ctl_table *ctl, int write, int sctp_sysctl_net_register(struct net *net) { + size_t table_size = ARRAY_SIZE(sctp_net_table); struct ctl_table *table; int i; @@ -604,7 +601,7 @@ int sctp_sysctl_net_register(struct net *net) if (!table) return -ENOMEM; - for (i = 0; table[i].data; i++) + for (i = 0; i < table_size; i++) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max; @@ -613,8 +610,7 @@ int sctp_sysctl_net_register(struct net *net) table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans; net->sctp.sysctl_header = register_net_sysctl_sz(net, "net/sctp", - table, - ARRAY_SIZE(sctp_net_table)); + table, table_size); if (net->sctp.sysctl_header == NULL) { kfree(table); return -ENOMEM; @@ -624,7 +620,7 @@ int sctp_sysctl_net_register(struct net *net) void sctp_sysctl_net_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->sctp.sysctl_header->ctl_table_arg; unregister_net_sysctl_table(net->sctp.sysctl_header); diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 746be39967..ba5e6a2dd2 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,3 +20,16 @@ config SMC_DIAG smcss. if unsure, say Y. + +config SMC_LO + bool "SMC intra-OS shortcut with loopback-ism" + depends on SMC + default n + help + SMC_LO enables the creation of an Emulated-ISM device named + loopback-ism in SMC and makes use of it for transferring data + when communication occurs within the same OS. This helps in + convenient testing of SMC-D since loopback-ism is independent + of architecture or hardware. + + if unsure, say N. diff --git a/net/smc/Makefile b/net/smc/Makefile index 875efcd126..2c510d5430 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o +smc-$(CONFIG_SMC_LO) += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0f53a5c6fd..c5f98c6b25 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,7 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_loopback.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -177,7 +178,7 @@ static struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; -int smc_hash_sk(struct sock *sk) +static int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; @@ -191,9 +192,8 @@ int smc_hash_sk(struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(smc_hash_sk); -void smc_unhash_sk(struct sock *sk) +static void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -202,7 +202,6 @@ void smc_unhash_sk(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } -EXPORT_SYMBOL_GPL(smc_unhash_sk); /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the @@ -460,29 +459,11 @@ out: static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) { - struct net *nnet = sock_net(nsk); - nsk->sk_userlocks = osk->sk_userlocks; - if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) nsk->sk_sndbuf = osk->sk_sndbuf; - } else { - if (mask == SK_FLAGS_SMC_TO_CLC) - WRITE_ONCE(nsk->sk_sndbuf, - READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); - else - WRITE_ONCE(nsk->sk_sndbuf, - 2 * READ_ONCE(nnet->smc.sysctl_wmem)); - } - if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) nsk->sk_rcvbuf = osk->sk_rcvbuf; - } else { - if (mask == SK_FLAGS_SMC_TO_CLC) - WRITE_ONCE(nsk->sk_rcvbuf, - READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); - else - WRITE_ONCE(nsk->sk_rcvbuf, - 2 * READ_ONCE(nnet->smc.sysctl_rmem)); - } } static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, @@ -1046,7 +1027,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; int i = 1, entry = 1; - bool is_virtual; + bool is_emulated; u16 chid; if (smcd_indicated(ini->smc_type_v1)) @@ -1058,12 +1039,12 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; - is_virtual = __smc_ism_is_virtual(chid); + is_emulated = __smc_ism_is_emulated(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { - if (is_virtual && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) + if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) /* It's the last GID-CHID entry left in CLC - * Proposal SMC-Dv2 extension, but a virtual + * Proposal SMC-Dv2 extension, but an Emulated- * ISM device will take two entries. So give * up it and try the next potential ISM device. */ @@ -1073,7 +1054,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, ini->is_smcd = true; rc = 0; i++; - entry = is_virtual ? entry + 2 : entry + 1; + entry = is_emulated ? entry + 2 : entry + 1; if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } @@ -1414,10 +1395,10 @@ static int smc_connect_ism(struct smc_sock *smc, if (rc) return rc; - if (__smc_ism_is_virtual(ini->ism_chid[ini->ism_selected])) + if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected])) ini->ism_peer_gid[ini->ism_selected].gid_ext = ntohll(aclc->d1.gid_ext); - /* for non-virtual ISM devices, peer gid_ext remains 0. */ + /* for non-Emulated-ISM devices, peer gid_ext remains 0. */ } ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); @@ -1437,6 +1418,14 @@ static int smc_connect_ism(struct smc_sock *smc, } smc_conn_save_peer_info(smc, aclc); + + if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(smc); + if (rc) { + rc = SMC_CLC_DECL_MEM; /* try to fallback */ + goto connect_abort; + } + } smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); @@ -2118,10 +2107,10 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { ini->ism_peer_gid[*matches].gid = proposed_gid->gid; - if (__smc_ism_is_virtual(proposed_chid)) + if (__smc_ism_is_emulated(proposed_chid)) ini->ism_peer_gid[*matches].gid_ext = proposed_gid->gid_ext; - /* non-virtual ISM's peer gid_ext remains 0. */ + /* non-Emulated-ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; @@ -2171,10 +2160,10 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); smcd_gid.gid_ext = 0; chid = ntohs(smcd_v2_ext->gidchid[i].chid); - if (__smc_ism_is_virtual(chid)) { + if (__smc_ism_is_emulated(chid)) { if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) - /* each virtual ISM device takes two GID-CHID + /* each Emulated-ISM device takes two GID-CHID * entries and CHID of the second entry repeats * that of the first entry. * @@ -2541,6 +2530,14 @@ static void smc_listen_work(struct work_struct *work) mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); + + if (ini->is_smcd && + smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(new_smc); + if (rc) + goto out_decl; + } + smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; @@ -2674,7 +2671,7 @@ out: } static int smc_accept(struct socket *sock, struct socket *new_sock, - int flags, bool kern) + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); @@ -2693,7 +2690,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, } /* Wait for an incoming connection */ - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); @@ -2720,7 +2717,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (rc) goto out; - if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * MSEC_PER_SEC); @@ -3557,15 +3554,23 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_loopback_init(); + if (rc) { + pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); + goto out_ib; + } + rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_ib; + goto out_lo; } static_branch_enable(&tcp_have_smc); return 0; +out_lo: + smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3603,6 +3608,7 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); + smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc.h b/net/smc/smc.h index df64efd2de..18c8b78701 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -56,11 +56,11 @@ enum smc_state { /* possible states of an SMC socket */ }; enum smc_supplemental_features { - SMC_SPF_VIRT_ISM_DEV = 0, + SMC_SPF_EMULATED_ISM_DEV = 0, }; #define SMC_FEATURE_MASK \ - (BIT(SMC_SPF_VIRT_ISM_DEV)) + (BIT(SMC_SPF_EMULATED_ISM_DEV)) struct smc_link_group; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 3c06625ceb..619b3bab38 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -18,6 +18,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_ism.h" /********************************** send *************************************/ @@ -255,6 +256,14 @@ int smcd_cdc_msg_send(struct smc_connection *conn) return rc; smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + /* if local sndbuf shares the same memory region with + * peer DMB, then don't update the tx_curs_fin + * and sndbuf_space until peer has consumed the data. + */ + return 0; + /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -266,7 +275,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn) smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); smc_tx_sndbuf_nonfull(smc); - return rc; + return 0; } /********************************* receive ***********************************/ @@ -323,7 +332,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; - int diff_cons, diff_prod; + int diff_cons, diff_prod, diff_tx; smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); @@ -339,6 +348,29 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, atomic_add(diff_cons, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); + + /* if local sndbuf shares the same memory region with + * peer RMB, then update tx_curs_fin and sndbuf_space + * here since peer has already consumed the data. + */ + if (conn->lgr->is_smcd && + smc_ism_support_dmb_nocopy(conn->lgr->smcd)) { + /* Calculate consumed data and + * increment free send buffer space. + */ + diff_tx = smc_curs_diff(conn->sndbuf_desc->len, + &conn->tx_curs_fin, + &conn->local_rx_ctrl.cons); + /* increase local sndbuf space and fin_curs */ + smp_mb__before_atomic(); + atomic_add(diff_tx, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, + &conn->local_rx_ctrl.cons, conn); + + smc_tx_sndbuf_nonfull(smc); + } } diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 9a13709bea..33fa787c28 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -853,8 +853,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; - v2_ext = &pclc->pclc_v2_ext; - smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + v2_ext = container_of(&pclc->pclc_v2_ext, + struct smc_clc_v2_extension, fixed); + smcd_v2_ext = container_of(&pclc->pclc_smcd_v2_ext, + struct smc_clc_smcd_v2_extension, fixed); gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; @@ -952,8 +954,8 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); - if (smc_ism_is_virtual(smcd)) { - /* a virtual ISM device takes two + if (smc_ism_is_emulated(smcd)) { + /* an Emulated-ISM device takes two * entries. CHID of the second entry * repeats that of the first entry. */ @@ -1055,7 +1057,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, clc->d1.chid = htons(chid); if (eid && eid[0]) memcpy(clc->d1.eid, eid, SMC_MAX_EID_LEN); - if (__smc_ism_is_virtual(chid)) + if (__smc_ism_is_emulated(chid)) clc->d1.gid_ext = htonll(smcd_gid.gid_ext); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index a9f9bdd26d..467effb50c 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -134,12 +134,15 @@ struct smc_clc_smcd_gid_chid { */ struct smc_clc_v2_extension { - struct smc_clnt_opts_area_hdr hdr; - u8 roce[16]; /* RoCEv2 GID */ - u8 max_conns; - u8 max_links; - __be16 feature_mask; - u8 reserved[12]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_v2_extension_fixed, fixed, + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 max_conns; + u8 max_links; + __be16 feature_mask; + u8 reserved[12]; + ); u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -159,8 +162,11 @@ struct smc_clc_msg_smcd { /* SMC-D GID information */ }; struct smc_clc_smcd_v2_extension { - u8 system_eid[SMC_MAX_EID_LEN]; - u8 reserved[16]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + ); struct smc_clc_smcd_gid_chid gidchid[]; }; @@ -175,7 +181,7 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ #define SMCD_CLC_MAX_V2_GID_ENTRIES 8 /* max # of CHID-GID entries in CLC * proposal SMC-Dv2 extension. * each ISM device takes one entry and - * each virtual ISM takes two entries. + * each Emulated-ISM takes two entries */ struct smc_clc_msg_proposal_area { @@ -183,9 +189,9 @@ struct smc_clc_msg_proposal_area { struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_v2_extension pclc_v2_ext; + struct smc_clc_v2_extension_fixed pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; - struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; + struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; struct smc_clc_smcd_gid_chid pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e4c8584112..acca3b1a06 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1149,6 +1149,20 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, } } +static void smcd_buf_detach(struct smc_connection *conn) +{ + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + + if (!conn->sndbuf_desc) + return; + + smc_ism_detach_dmb(smcd, peer_token); + + kfree(conn->sndbuf_desc); + conn->sndbuf_desc = NULL; +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { @@ -1192,6 +1206,8 @@ void smc_conn_free(struct smc_connection *conn) if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(lgr->smcd)) + smcd_buf_detach(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); @@ -1445,6 +1461,8 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + smcd_buf_detach(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else @@ -1535,7 +1553,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { if ((!peer_gid->gid || (lgr->peer_gid.gid == peer_gid->gid && - !smc_ism_is_virtual(dev) ? 1 : + !smc_ism_is_emulated(dev) ? 1 : lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { if (peer_gid->gid) /* peer triggered termination */ @@ -1881,7 +1899,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, lgr->smcd != smcismdev) return false; - if (smc_ism_is_virtual(smcismdev) && + if (smc_ism_is_emulated(smcismdev) && lgr->peer_gid.gid_ext != peer_gid->gid_ext) return false; @@ -1997,7 +2015,6 @@ out: */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { - const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) @@ -2007,9 +2024,11 @@ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); +#ifdef CONFIG_ARCH_NO_SG_CHAIN if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + compressed = min_t(u8, compressed, ilog2((SG_MAX_SINGLE_ALLOC * PAGE_SIZE) >> 14)); +#endif return compressed; } @@ -2464,12 +2483,18 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) int rc; /* create send buffer */ + if (is_smcd && + smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) + goto create_rmb; + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; + +create_rmb: /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); - if (rc) { + if (rc && smc->conn.sndbuf_desc) { down_write(&smc->conn.lgr->sndbufs_lock); list_del(&smc->conn.sndbuf_desc->list); up_write(&smc->conn.lgr->sndbufs_lock); @@ -2479,6 +2504,41 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) return rc; } +int smcd_buf_attach(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + struct smc_buf_desc *buf_desc; + int rc; + + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return -ENOMEM; + + /* The ghost sndbuf_desc describes the same memory region as + * peer RMB. Its lifecycle is consistent with the connection's + * and it will be freed with the connections instead of the + * link group. + */ + rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc); + if (rc) + goto free; + + smc->sk.sk_sndbuf = buf_desc->len; + buf_desc->cpu_addr = + (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg); + buf_desc->len -= sizeof(struct smcd_cdc_msg); + conn->sndbuf_desc = buf_desc; + conn->sndbuf_desc->used = 1; + atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len); + return 0; + +free: + kfree(buf_desc); + return rc; +} + static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1f17537603..d93cf51dbd 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -557,6 +557,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); +int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 5a33908015..6fdb2d9677 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -255,6 +255,7 @@ static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler smc_diag_handler = { + .owner = THIS_MODULE, .family = AF_SMC, .dump = smc_diag_handler_dump, }; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 97704a9e84..9297dc20bf 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -209,13 +209,18 @@ int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, if (IS_ERR(rt)) goto out; if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) - goto out; - neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); - if (neigh) { - memcpy(nexthop_mac, neigh->ha, ETH_ALEN); - *uses_gateway = rt->rt_uses_gateway; - return 0; - } + goto out_rt; + neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr); + if (!neigh) + goto out_rt; + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + neigh_release(neigh); + ip_rt_put(rt); + return 0; + +out_rt: + ip_rt_put(rt); out: return -ENOENT; } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ac88de2a06..84f98e18c7 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -91,6 +91,11 @@ bool smc_ism_is_v2_capable(void) return smc_ism_v2_capable; } +void smc_ism_set_v2_capable(void) +{ + smc_ism_v2_capable = true; +} + /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { @@ -126,6 +131,8 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->ops->add_vlan_id) + return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); @@ -171,6 +178,8 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->ops->del_vlan_id) + return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { @@ -222,7 +231,6 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { -#if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; @@ -231,7 +239,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; @@ -240,9 +248,46 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb_desc->len = dmb.dmb_len; } return rc; -#else - return 0; -#endif +} + +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) +{ + /* for now only loopback-ism supports + * merging sndbuf with peer DMB to avoid + * data copies between them. + */ + return (smcd->ops->support_dmb_nocopy && + smcd->ops->support_dmb_nocopy(smcd)); +} + +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc = 0; + + if (!dev->ops->attach_dmb) + return -EINVAL; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = token; + rc = dev->ops->attach_dmb(dev, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) +{ + if (!dev->ops->detach_dmb) + return -EINVAL; + + return dev->ops->detach_dmb(dev, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -322,6 +367,8 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; + if (smc_ism_is_loopback(smcd)) + goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: @@ -372,7 +419,8 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ - if (ev_info.code == ISM_EVENT_REQUEST) { + if (ev_info.code == ISM_EVENT_REQUEST && + wrk->smcd->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, &peer_gid, @@ -436,7 +484,7 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); - struct smcd_dev *smcd; + struct smcd_dev *smcd, *fentry; if (!ops) return; @@ -446,20 +494,28 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd) return; smcd->priv = ism; + smcd->client = &smc_ism_client; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); + if (smcd->ops->supports_v2()) + smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); - if (list_empty(&smcd_dev_list.list)) { - if (smcd->ops->supports_v2()) - smc_ism_v2_capable = true; - } - /* sort list: devices without pnetid before devices with pnetid */ - if (smcd->pnetid[0]) + /* sort list: + * - devices without pnetid before devices with pnetid; + * - loopback-ism always at the very beginning; + */ + if (!smcd->pnetid[0]) { + fentry = list_first_entry_or_null(&smcd_dev_list.list, + struct smcd_dev, list); + if (fentry && smc_ism_is_loopback(fentry)) + list_add(&smcd->list, &fentry->list); + else + list_add(&smcd->list, &smcd_dev_list.list); + } else { list_add_tail(&smcd->list, &smcd_dev_list.list); - else - list_add(&smcd->list, &smcd_dev_list.list); + } mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", @@ -541,6 +597,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) if (lgr->peer_shutdown) return 0; + if (!lgr->smcd->ops->signal_event) + return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index ffff40c30a..6763133dd8 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -15,7 +15,7 @@ #include "smc.h" -#define SMC_VIRTUAL_ISM_CHID_MASK 0xFF00 +#define SMC_EMULATED_ISM_CHID_MASK 0xFF00 #define SMC_ISM_IDENT_MASK 0x00FFFF struct smcd_dev_list { /* List of SMCD devices */ @@ -48,10 +48,15 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc); +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token); int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); +void smc_ism_set_v2_capable(void); int smc_ism_init(void); void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); @@ -66,10 +71,10 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, return rc < 0 ? rc : 0; } -static inline bool __smc_ism_is_virtual(u16 chid) +static inline bool __smc_ism_is_emulated(u16 chid) { /* CHIDs in range of 0xFF00 to 0xFFFF are reserved - * for virtual ISM device. + * for Emulated-ISM device. * * loopback-ism: 0xFFFF * virtio-ism: 0xFF00 ~ 0xFFFE @@ -77,11 +82,16 @@ static inline bool __smc_ism_is_virtual(u16 chid) return ((chid & 0xFF00) == 0xFF00); } -static inline bool smc_ism_is_virtual(struct smcd_dev *smcd) +static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { u16 chid = smcd->ops->get_chid(smcd); - return __smc_ism_is_virtual(chid); + return __smc_ism_is_emulated(chid); +} + +static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +{ + return (smcd->ops->get_chid(smcd) == 0xFFFF); } #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c new file mode 100644 index 0000000000..3c5f64ca41 --- /dev/null +++ b/net/smc/smc_loopback.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications Direct over loopback-ism device. + * + * Functions for loopback-ism device. + * + * Copyright (c) 2024, Alibaba Inc. + * + * Author: Wen Gu <guwen@linux.alibaba.com> + * Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#include <linux/device.h> +#include <linux/types.h> +#include <net/smc.h> + +#include "smc_cdc.h" +#include "smc_ism.h" +#include "smc_loopback.h" + +#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ +#define SMC_LO_SUPPORT_NOCOPY 0x1 +#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) + +static const char smc_lo_dev_name[] = "loopback-ism"; +static struct smc_lo_dev *lo_dev; + +static void smc_lo_generate_ids(struct smc_lo_dev *ldev) +{ + struct smcd_gid *lgid = &ldev->local_gid; + uuid_t uuid; + + uuid_gen(&uuid); + memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); + memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), + sizeof(lgid->gid_ext)); + + ldev->chid = SMC_LO_RESERVED_CHID; +} + +static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, + u32 vid_valid, u32 vid) +{ + struct smc_lo_dev *ldev = smcd->priv; + + /* rgid should be the same as lgid */ + if (!ldev || rgid->gid != ldev->local_gid.gid || + rgid->gid_ext != ldev->local_gid.gid_ext) + return -ENETUNREACH; + return 0; +} + +static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, + void *client_priv) +{ + struct smc_lo_dmb_node *dmb_node, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + int sba_idx, rc; + + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == SMC_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->sba_idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, + struct smc_lo_dmb_node *dmb_node) +{ + /* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kvfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __smc_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) +{ + return SMC_LO_SUPPORT_NOCOPY; +} + +static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. + */ + return -EINVAL; + + /* provide dmb information */ + dmb->sba_idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __smc_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + struct smc_connection *conn; + + if (!sf) + /* since sndbuf is merged with peer DMB, there is + * no need to copy data from sndbuf to peer DMB. + */ + return 0; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + read_unlock_bh(&ldev->dmb_ht_lock); + + conn = smcd->conn[rmb_node->sba_idx]; + if (!conn || conn->killed) + return -EPIPE; + tasklet_schedule(&conn->rx_tsklet); + return 0; +} + +static int smc_lo_supports_v2(void) +{ + return SMC_LO_V2_CAPABLE; +} + +static void smc_lo_get_local_gid(struct smcd_dev *smcd, + struct smcd_gid *smcd_gid) +{ + struct smc_lo_dev *ldev = smcd->priv; + + smcd_gid->gid = ldev->local_gid.gid; + smcd_gid->gid_ext = ldev->local_gid.gid_ext; +} + +static u16 smc_lo_get_chid(struct smcd_dev *smcd) +{ + return ((struct smc_lo_dev *)smcd->priv)->chid; +} + +static struct device *smc_lo_get_dev(struct smcd_dev *smcd) +{ + return &((struct smc_lo_dev *)smcd->priv)->dev; +} + +static const struct smcd_ops lo_ops = { + .query_remote_gid = smc_lo_query_rgid, + .register_dmb = smc_lo_register_dmb, + .unregister_dmb = smc_lo_unregister_dmb, + .support_dmb_nocopy = smc_lo_support_dmb_nocopy, + .attach_dmb = smc_lo_attach_dmb, + .detach_dmb = smc_lo_detach_dmb, + .add_vlan_id = NULL, + .del_vlan_id = NULL, + .set_vlan_required = NULL, + .reset_vlan_required = NULL, + .signal_event = NULL, + .move_data = smc_lo_move_data, + .supports_v2 = smc_lo_supports_v2, + .get_local_gid = smc_lo_get_local_gid, + .get_chid = smc_lo_get_chid, + .get_dev = smc_lo_get_dev, +}; + +static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, + int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) + goto out_smcd; + + smcd->ops = ops; + + spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); + INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); + return smcd; + +out_smcd: + kfree(smcd); + return NULL; +} + +static int smcd_lo_register_dev(struct smc_lo_dev *ldev) +{ + struct smcd_dev *smcd; + + smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); + if (!smcd) + return -ENOMEM; + ldev->smcd = smcd; + smcd->priv = ldev; + smc_ism_set_v2_capable(); + mutex_lock(&smcd_dev_list.mutex); + list_add(&smcd->list, &smcd_dev_list.list); + mutex_unlock(&smcd_dev_list.mutex); + pr_warn_ratelimited("smc: adding smcd device %s\n", + dev_name(&ldev->dev)); + return 0; +} + +static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +{ + struct smcd_dev *smcd = ldev->smcd; + + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ldev->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); + mutex_lock(&smcd_dev_list.mutex); + list_del_init(&smcd->list); + mutex_unlock(&smcd_dev_list.mutex); + kfree(smcd->conn); + kfree(smcd); +} + +static int smc_lo_dev_init(struct smc_lo_dev *ldev) +{ + smc_lo_generate_ids(ldev); + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); + + return smcd_lo_register_dev(ldev); +} + +static void smc_lo_dev_exit(struct smc_lo_dev *ldev) +{ + smcd_lo_unregister_dev(ldev); + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + +static void smc_lo_dev_release(struct device *dev) +{ + struct smc_lo_dev *ldev = + container_of(dev, struct smc_lo_dev, dev); + + kfree(ldev); +} + +static int smc_lo_dev_probe(void) +{ + struct smc_lo_dev *ldev; + int ret; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) + return -ENOMEM; + + ldev->dev.parent = NULL; + ldev->dev.release = smc_lo_dev_release; + device_initialize(&ldev->dev); + dev_set_name(&ldev->dev, smc_lo_dev_name); + + ret = smc_lo_dev_init(ldev); + if (ret) + goto free_dev; + + lo_dev = ldev; /* global loopback device */ + return 0; + +free_dev: + put_device(&ldev->dev); + return ret; +} + +static void smc_lo_dev_remove(void) +{ + if (!lo_dev) + return; + + smc_lo_dev_exit(lo_dev); + put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ +} + +int smc_loopback_init(void) +{ + return smc_lo_dev_probe(); +} + +void smc_loopback_exit(void) +{ + smc_lo_dev_remove(); +} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h new file mode 100644 index 0000000000..6dd4292dae --- /dev/null +++ b/net/smc/smc_loopback.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications Direct over loopback-ism device. + * + * SMC-D loopback-ism device structure definitions. + * + * Copyright (c) 2024, Alibaba Inc. + * + * Author: Wen Gu <guwen@linux.alibaba.com> + * Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#ifndef _SMC_LOOPBACK_H +#define _SMC_LOOPBACK_H + +#include <linux/device.h> +#include <linux/err.h> +#include <net/smc.h> + +#if IS_ENABLED(CONFIG_SMC_LO) +#define SMC_LO_MAX_DMBS 5000 +#define SMC_LO_DMBS_HASH_BITS 12 +#define SMC_LO_RESERVED_CHID 0xFFFF + +struct smc_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; + +struct smc_lo_dev { + struct smcd_dev *smcd; + struct device dev; + u16 chid; + struct smcd_gid local_gid; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; +}; + +int smc_loopback_init(void); +void smc_loopback_exit(void); +#else +static inline int smc_loopback_init(void) +{ + return 0; +} + +static inline void smc_loopback_exit(void) +{ +} +#endif + +#endif /* _SMC_LOOPBACK_H */ diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9a2f3638d1..f0cbe77a80 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 9d32058db2..e19177ce40 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -19,7 +19,7 @@ #include "smc_clc.h" -#define SMC_MAX_FBACK_RSN_CNT 30 +#define SMC_MAX_FBACK_RSN_CNT 36 enum { SMC_BUF_8K, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a5946d1b9d..13f2bc092d 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -90,11 +90,11 @@ static struct ctl_table smc_table[] = { .extra1 = &conns_per_lgr_min, .extra2 = &conns_per_lgr_max, }, - { } }; int __net_init smc_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(smc_table); struct ctl_table *table; table = smc_table; @@ -105,12 +105,12 @@ int __net_init smc_sysctl_net_init(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data += (void *)net - (void *)&init_net; } net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, - ARRAY_SIZE(smc_table)); + table_size); if (!net->smc.smc_hdr) goto err_reg; @@ -133,7 +133,7 @@ err_alloc: void __net_exit smc_sysctl_net_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index 9fc5e586d2..a9a6e3c111 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; - __assign_str(name, smc->conn.lnk->ibname); + __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", @@ -104,7 +104,7 @@ TRACE_EVENT(smcr_link_down, __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; - __assign_str(name, lnk->ibname); + __assign_str(name); __entry->location = location; ), diff --git a/net/socket.c b/net/socket.c index ed3df2f749..e416920e93 100644 --- a/net/socket.c +++ b/net/socket.c @@ -88,7 +88,7 @@ #include <linux/xattr.h> #include <linux/nospec.h> #include <linux/indirect_call_wrapper.h> -#include <linux/io_uring.h> +#include <linux/io_uring/net.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -343,7 +343,7 @@ static void init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); BUG_ON(sock_inode_cachep == NULL); } @@ -1890,7 +1890,7 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -struct file *do_accept(struct file *file, unsigned file_flags, +struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { @@ -1926,8 +1926,8 @@ struct file *do_accept(struct file *file, unsigned file_flags, if (err) goto out_fd; - err = ops->accept(sock, newsock, sock->file->f_flags | file_flags, - false); + arg->flags |= sock->file->f_flags; + err = ops->accept(sock, newsock, arg); if (err < 0) goto out_fd; @@ -1953,6 +1953,7 @@ out_fd: static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { + struct proto_accept_arg arg = { }; struct file *newfile; int newfd; @@ -1966,7 +1967,7 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s if (unlikely(newfd < 0)) return newfd; - newfile = do_accept(file, 0, upeer_sockaddr, upeer_addrlen, + newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags); if (IS_ERR(newfile)) { put_unused_fd(newfd); @@ -2600,9 +2601,9 @@ out: return err; } -int sendmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct iovec **iov) +static int sendmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct iovec **iov) { int err; @@ -2753,10 +2754,10 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } -int recvmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct sockaddr __user **uaddr, - struct iovec **iov) +static int recvmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct sockaddr __user **uaddr, + struct iovec **iov) { ssize_t err; @@ -3580,6 +3581,10 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; const struct proto_ops *ops = READ_ONCE(sock->ops); + struct proto_accept_arg arg = { + .flags = flags, + .kern = true, + }; int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, @@ -3587,7 +3592,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) if (err < 0) goto done; - err = ops->accept(sock, *newsock, flags, true); + err = ops->accept(sock, *newsock, &arg); if (err < 0) { sock_release(*newsock); *newsock = NULL; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index c7af0220f8..369310909f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1875,8 +1875,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* slack space should prevent this ever happening: */ - if (unlikely(snd_buf->len > snd_buf->buflen)) + if (unlikely(snd_buf->len > snd_buf->buflen)) { + status = -EIO; goto wrap_failed; + } /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was * done anyway, so it's safe to put the request on the wire: */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h index c53b329092..4ebc1b7043 100644 --- a/net/sunrpc/auth_gss/auth_gss_internal.h +++ b/net/sunrpc/auth_gss/auth_gss_internal.h @@ -23,7 +23,7 @@ simple_get_bytes(const void *p, const void *end, void *res, size_t len) } static inline const void * -simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +simple_get_netobj_noprof(const void *p, const void *end, struct xdr_netobj *dest) { const void *q; unsigned int len; @@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) if (unlikely(q > end || q < p)) return ERR_PTR(-EFAULT); if (len) { - dest->data = kmemdup(p, len, GFP_KERNEL); + dest->data = kmemdup_noprof(p, len, GFP_KERNEL); if (unlikely(dest->data == NULL)) return ERR_PTR(-ENOMEM); } else @@ -43,3 +43,5 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) dest->len = len; return q; } + +#define simple_get_netobj(...) alloc_hooks(simple_get_netobj_noprof(__VA_ARGS__)) diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 06d8ee0db0..4eb19c3a54 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -168,7 +168,7 @@ static int krb5_DK(const struct gss_krb5_enctype *gk5e, goto err_return; blocksize = crypto_sync_skcipher_blocksize(cipher); if (crypto_sync_skcipher_setkey(cipher, inkey->data, inkey->len)) - goto err_return; + goto err_free_cipher; ret = -ENOMEM; inblockdata = kmalloc(blocksize, gfp_mask); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 24de941847..73a90ad873 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1033,17 +1033,11 @@ null_verifier: static void gss_free_in_token_pages(struct gssp_in_token *in_token) { - u32 inlen; int i; i = 0; - inlen = in_token->page_len; - while (inlen) { - if (in_token->pages[i]) - put_page(in_token->pages[i]); - inlen -= inlen > PAGE_SIZE ? PAGE_SIZE : inlen; - } - + while (in_token->pages[i]) + put_page(in_token->pages[i++]); kfree(in_token->pages); in_token->pages = NULL; } @@ -1075,7 +1069,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); - in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL); + in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index cda0935a68..09f29a95f2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -405,7 +405,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; - clnt->cl_stats = program->stats; + clnt->cl_stats = args->stats ? : program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects); err = -ENOMEM; @@ -691,6 +691,7 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) .version = clnt->cl_vers, .authflavor = clnt->cl_auth->au_flavor, .cred = clnt->cl_cred, + .stats = clnt->cl_stats, }; return __rpc_clone_client(&args, clnt); } @@ -713,6 +714,7 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) .version = clnt->cl_vers, .authflavor = flavor, .cred = clnt->cl_cred, + .stats = clnt->cl_stats, }; return __rpc_clone_client(&args, clnt); } @@ -1068,6 +1070,8 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, .version = vers, .authflavor = old->cl_auth->au_flavor, .cred = old->cl_cred, + .stats = old->cl_stats, + .timeout = old->cl_timeout, }; struct rpc_clnt *clnt; int err; @@ -2322,12 +2326,13 @@ call_transmit_status(struct rpc_task *task) task->tk_action = call_transmit; task->tk_status = 0; break; - case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: + break; + case -ECONNREFUSED: if (RPC_IS_SOFTCONN(task)) { if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, @@ -2695,8 +2700,19 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) goto out_msg_denied; error = rpcauth_checkverf(task, xdr); - if (error) + if (error) { + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + if (!test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) { + rpcauth_invalcred(task); + if (!task->tk_cred_retry) + goto out_err; + task->tk_cred_retry--; + trace_rpc__stale_creds(task); + return -EKEYREJECTED; + } goto out_verifier; + } p = xdr_inline_decode(xdr, sizeof(*p)); if (!p) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index dcc2b4f49e..910a5d850d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1490,7 +1490,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6debf4fd42..cef623ea15 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -369,8 +369,10 @@ static void rpc_make_runnable(struct workqueue_struct *wq, if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); queue_work(wq, &task->u.tk_work); - } else + } else { + smp_mb__after_atomic(); wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); + } } /* diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 65fc1297c6..383860cb1d 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { - return do_register(net, statp->program->pg_name, statp, proc_ops); + return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b969e505c7..d9cda1e53a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -451,8 +451,8 @@ __svc_init_bc(struct svc_serv *serv) * Create an RPC service */ static struct svc_serv * -__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - int (*threadfn)(void *data)) +__svc_create(struct svc_program *prog, struct svc_stat *stats, + unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; @@ -463,7 +463,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - serv->sv_stats = prog->pg_stats; + serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; @@ -529,26 +529,28 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { - return __svc_create(prog, bufsize, 1, threadfn); + return __svc_create(prog, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads * @prog: the RPC program the new service will handle + * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, threadfn); + serv = __svc_create(prog, stats, bufsize, npools, threadfn); if (!serv) goto out_err; return serv; @@ -1263,8 +1265,6 @@ svc_generic_init_request(struct svc_rqst *rqstp, if (rqstp->rq_proc >= versp->vs_nproc) goto err_bad_proc; rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc]; - if (!procp) - goto err_bad_proc; /* Initialize storage for argp and resp */ memset(rqstp->rq_argp, 0, procp->pc_argzero); @@ -1375,7 +1375,8 @@ svc_process_common(struct svc_rqst *rqstp) goto err_bad_proc; /* Syntactic check complete */ - serv->sv_stats->rpccnt++; + if (serv->sv_stats) + serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); aoffset = xdr_stream_pos(xdr); @@ -1427,7 +1428,8 @@ err_short_len: goto close_xprt; err_bad_rpc: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_MISMATCH); /* Only RPCv2 supported */ @@ -1438,7 +1440,8 @@ err_bad_rpc: err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); - serv->sv_stats->rpcbadauth++; + if (serv->sv_stats) + serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of reply status: */ xdr_truncate_encode(xdr, XDR_UNIT * 2); xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); @@ -1448,7 +1451,8 @@ err_bad_auth: err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; @@ -1456,7 +1460,8 @@ err_bad_vers: svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_mismatch; /* @@ -1470,19 +1475,22 @@ err_bad_vers: err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; err_garbage_args: svc_printk(rqstp, "failed to decode RPC header\n"); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_garbage_args; goto sendit; err_system_err: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_system_err; goto sendit; } @@ -1534,7 +1542,8 @@ void svc_process(struct svc_rqst *rqstp) out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", be32_to_cpu(*p)); - rqstp->rq_server->sv_stats->rpcbadfmt++; + if (rqstp->rq_server->sv_stats) + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); } @@ -1548,9 +1557,11 @@ out_drop: */ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) { + struct rpc_timeout timeout = { + .to_increment = 0, + }; struct rpc_task *task; int proc_error; - struct rpc_timeout timeout; /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xid = req->rq_xid; @@ -1603,6 +1614,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) timeout.to_initval = req->rq_xprt->timeout->to_initval; timeout.to_retries = req->rq_xprt->timeout->to_retries; } + timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); @@ -1612,7 +1624,6 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); rpc_put_task(task); } -EXPORT_SYMBOL_GPL(svc_process_bc); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b4a85a227b..dd86d7f1e9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -46,7 +46,6 @@ static LIST_HEAD(svc_xprt_class_list); /* SMP locking strategy: * - * svc_pool->sp_lock protects most of the fields of that pool. * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. * The "service mutex" protects svc_serv->sv_nrthread. @@ -211,51 +210,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, } EXPORT_SYMBOL_GPL(svc_xprt_init); -static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, - struct svc_serv *serv, - struct net *net, - const int family, - const unsigned short port, - int flags) -{ - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_ANY), - .sin_port = htons(port), - }; -#if IS_ENABLED(CONFIG_IPV6) - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_ANY_INIT, - .sin6_port = htons(port), - }; -#endif - struct svc_xprt *xprt; - struct sockaddr *sap; - size_t len; - - switch (family) { - case PF_INET: - sap = (struct sockaddr *)&sin; - len = sizeof(sin); - break; -#if IS_ENABLED(CONFIG_IPV6) - case PF_INET6: - sap = (struct sockaddr *)&sin6; - len = sizeof(sin6); - break; -#endif - default: - return ERR_PTR(-EAFNOSUPPORT); - } - - xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); - if (IS_ERR(xprt)) - trace_svc_xprt_create_err(serv->sv_program->pg_name, - xcl->xcl_name, sap, len, xprt); - return xprt; -} - /** * svc_xprt_received - start next receiver thread * @xprt: controlling transport @@ -294,9 +248,8 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) } static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, - struct net *net, const int family, - const unsigned short port, int flags, - const struct cred *cred) + struct net *net, struct sockaddr *sap, + size_t len, int flags, const struct cred *cred) { struct svc_xprt_class *xcl; @@ -312,8 +265,11 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags); + newxprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); if (IS_ERR(newxprt)) { + trace_svc_xprt_create_err(serv->sv_program->pg_name, + xcl->xcl_name, sap, len, + newxprt); module_put(xcl->xcl_owner); return PTR_ERR(newxprt); } @@ -330,6 +286,48 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, } /** + * svc_xprt_create_from_sa - Add a new listener to @serv from socket address + * @serv: target RPC service + * @xprt_name: transport class name + * @net: network namespace + * @sap: socket address pointer + * @flags: SVC_SOCK flags + * @cred: credential to bind to this transport + * + * Return local xprt port on success or %-EPROTONOSUPPORT on failure + */ +int svc_xprt_create_from_sa(struct svc_serv *serv, const char *xprt_name, + struct net *net, struct sockaddr *sap, + int flags, const struct cred *cred) +{ + size_t len; + int err; + + switch (sap->sa_family) { + case AF_INET: + len = sizeof(struct sockaddr_in); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + len = sizeof(struct sockaddr_in6); + break; +#endif + default: + return -EAFNOSUPPORT; + } + + err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags, cred); + if (err == -EPROTONOSUPPORT) { + request_module("svc%s", xprt_name); + err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags, + cred); + } + + return err; +} +EXPORT_SYMBOL_GPL(svc_xprt_create_from_sa); + +/** * svc_xprt_create - Add a new listener to @serv * @serv: target RPC service * @xprt_name: transport class name @@ -339,23 +337,41 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name, * @flags: SVC_SOCK flags * @cred: credential to bind to this transport * - * Return values: - * %0: New listener added successfully - * %-EPROTONOSUPPORT: Requested transport type not supported + * Return local xprt port on success or %-EPROTONOSUPPORT on failure */ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred) { - int err; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; +#endif + struct sockaddr *sap; - err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred); - if (err == -EPROTONOSUPPORT) { - request_module("svc%s", xprt_name); - err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred); + switch (family) { + case PF_INET: + sap = (struct sockaddr *)&sin; + break; +#if IS_ENABLED(CONFIG_IPV6) + case PF_INET6: + sap = (struct sockaddr *)&sin6; + break; +#endif + default: + return -EAFNOSUPPORT; } - return err; + + return svc_xprt_create_from_sa(serv, xprt_name, net, sap, flags, cred); } EXPORT_SYMBOL_GPL(svc_xprt_create); @@ -1260,6 +1276,40 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) } /** + * svc_find_listener - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @net: owner net pointer + * @sa: sockaddr containing address + * + * Return the transport instance pointer for the endpoint accepting + * connections/peer traffic from the specified transport class, + * and matching sockaddr. + */ +struct svc_xprt *svc_find_listener(struct svc_serv *serv, const char *xcl_name, + struct net *net, const struct sockaddr *sa) +{ + struct svc_xprt *xprt; + struct svc_xprt *found = NULL; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + if (xprt->xpt_net != net) + continue; + if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) + continue; + if (!rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) + continue; + found = xprt; + svc_xprt_get(xprt); + break; + } + spin_unlock_bh(&serv->sv_lock); + return found; +} +EXPORT_SYMBOL_GPL(svc_find_listener); + +/** * svc_find_xprt - find an RPC transport instance * @serv: pointer to svc_serv to search * @xcl_name: C string containing transport's class name diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 93941ab125..5f3170a1c9 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -160,7 +160,6 @@ static struct ctl_table debug_table[] = { .mode = 0444, .proc_handler = proc_do_xprt, }, - { } }; void diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index af13fdfa66..09f245cda5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1398,6 +1398,12 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task) if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) return; if (!list_empty(&req->rq_xmit)) { + struct rpc_xprt *xprt = req->rq_xprt; + + if (list_is_first(&req->rq_xmit, &xprt->xmit_queue) && + xprt->ops->abort_send_request) + xprt->ops->abort_send_request(req); + list_del(&req->rq_xmit); if (!list_empty(&req->rq_xmit2)) { struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, @@ -1541,6 +1547,9 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) int is_retrans = RPC_WAS_SENT(task); int status; + if (test_bit(XPRT_CLOSE_WAIT, &xprt->state)) + return -ENOTCONN; + if (!req->rq_bytes_sent) { if (xprt_request_data_received(task)) { status = 0; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ffbf998949..47f33bb7bf 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -92,7 +92,8 @@ static void frwr_mr_put(struct rpcrdma_mr *mr) rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); } -/* frwr_reset - Place MRs back on the free list +/** + * frwr_reset - Place MRs back on @req's free list * @req: request to reset * * Used after a failed marshal. For FRWR, this means the MRs diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index f86970733e..474f7a98fe 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -209,7 +209,6 @@ static struct ctl_table svcrdma_parm_table[] = { .extra1 = &zero, .extra2 = &zero, }, - { }, }; static void svc_rdma_proc_cleanup(void) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c9be677864..e5a78b7610 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -90,7 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c00fcce61d..40797114d5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -197,28 +197,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, llist_add_batch(first, last, &rdma->sc_rw_ctxts); } -/* State for sending a Write or Reply chunk. - * - Tracks progress of writing one chunk over all its segments - * - Stores arguments for the SGL constructor functions - */ -struct svc_rdma_write_info { - struct svcxprt_rdma *wi_rdma; - - const struct svc_rdma_chunk *wi_chunk; - - /* write state of this chunk */ - unsigned int wi_seg_off; - unsigned int wi_seg_no; - - /* SGL constructor arguments */ - const struct xdr_buf *wi_xdr; - unsigned char *wi_base; - unsigned int wi_next_off; - - struct svc_rdma_chunk_ctxt wi_cc; - struct work_struct wi_work; -}; - static struct svc_rdma_write_info * svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk) @@ -253,6 +231,49 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) } /** + * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc; + + if (!cc->cc_sqecount) + return; + svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE); +} + +/** + * svc_rdma_reply_done - Reply chunk Write completion handler + * @cq: controlling Completion Queue + * @wc: Work Completion report + * + * Pages under I/O are released by a subsequent Send completion. + */ +static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cq->cq_context; + + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_reply(&cc->cc_cid); + return; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_reply_err(wc, &cc->cc_cid); + } + + svc_xprt_deferred_close(&rdma->sc_xprt); +} + +/** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue * @wc: Work Completion @@ -580,41 +601,33 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/** - * svc_rdma_send_write_chunk - Write all segments in a Write chunk - * @rdma: controlling RDMA transport - * @chunk: Write chunk provided by the client - * @xdr: xdr_buf containing the data payload - * - * Returns a non-negative number of bytes the chunk consumed, or - * %-E2BIG if the payload was larger than the Write chunk, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). - */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct xdr_buf payload; int ret; + if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, + chunk->ch_payload_length)) + return -EMSGSIZE; + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; cc = &info->wi_cc; - ret = svc_rdma_xb_write(xdr, info); - if (ret != xdr->len) + ret = svc_rdma_xb_write(&payload, info); + if (ret != payload.len) goto out_err; trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; - return xdr->len; + return 0; out_err: svc_rdma_write_info_free(info); @@ -622,9 +635,37 @@ out_err: } /** - * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * svc_rdma_send_write_list - Send all chunks on the Write list * @rdma: controlling RDMA transport - * @rctxt: Write and Reply chunks from client + * @rctxt: Write list provisioned by the client + * @xdr: xdr_buf containing an RPC Reply message + * + * Returns zero on success, or a negative errno if one or more + * Write chunks could not be sent. + */ +int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_chunk *chunk; + int ret; + + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + if (!chunk->ch_payload_length) + break; + ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); + if (ret < 0) + return ret; + } + return 0; +} + +/** + * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk + * @rdma: controlling RDMA transport + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -634,39 +675,45 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { - struct svc_rdma_write_info *info; - struct svc_rdma_chunk_ctxt *cc; - struct svc_rdma_chunk *chunk; + struct svc_rdma_write_info *info = &sctxt->sc_reply_info; + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; int ret; - if (pcl_is_empty(&rctxt->rc_reply_pcl)) - return 0; - - chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); - info = svc_rdma_write_info_alloc(rdma, chunk); - if (!info) - return -ENOMEM; - cc = &info->wi_cc; + info->wi_rdma = rdma; + info->wi_chunk = pcl_first_chunk(reply_pcl); + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_cc.cc_cqe.done = svc_rdma_reply_done; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_write, info); if (ret < 0) - goto out_err; + return ret; - trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) - goto out_err; + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; - return xdr->len; + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; -out_err: - svc_rdma_write_info_free(info); - return ret; + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); + return xdr->len; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1a49b7f020..bb5436b719 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -205,9 +205,13 @@ out: xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, NULL); + svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc); ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; + ctxt->sc_wr_chain = &ctxt->sc_send_wr; + ctxt->sc_sqecount = 1; + return ctxt; out_empty: @@ -223,6 +227,8 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_reply_chunk_release(rdma, ctxt); + if (ctxt->sc_page_count) release_pages(ctxt->sc_pages, ctxt->sc_page_count); @@ -293,7 +299,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_send_ctxt *ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_wake_send_waiters(rdma, 1); + svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; @@ -312,51 +318,76 @@ flushed: } /** - * svc_rdma_send - Post a single Send WR - * @rdma: transport on which to post the WR - * @ctxt: send ctxt with a Send WR ready to post + * svc_rdma_post_send - Post a WR chain to the Send Queue + * @rdma: transport context + * @ctxt: WR chain to post + * + * Copy fields in @ctxt to stack variables in order to guarantee + * that these values remain available after the ib_post_send() call. + * In some error flow cases, svc_rdma_wc_send() releases @ctxt. + * + * Note there is potential for starvation when the Send Queue is + * full because there is no order to when waiting threads are + * awoken. The transport is typically provisioned with a deep + * enough Send Queue that SQ exhaustion should be a rare event. * - * Returns zero if the Send WR was posted successfully. Otherwise, a - * negative errno is returned. + * Return values: + * %0: @ctxt's WR chain was posted successfully + * %-ENOTCONN: The connection was lost */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) +int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { - struct ib_send_wr *wr = &ctxt->sc_send_wr; - int ret; + struct ib_send_wr *first_wr = ctxt->sc_wr_chain; + struct ib_send_wr *send_wr = &ctxt->sc_send_wr; + const struct ib_send_wr *bad_wr = first_wr; + struct rpc_rdma_cid cid = ctxt->sc_cid; + int ret, sqecount = ctxt->sc_sqecount; might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, - wr->sg_list[0].addr, - wr->sg_list[0].length, + send_wr->sg_list[0].addr, + send_wr->sg_list[0].length, DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ - while (1) { - if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { + if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + svc_rdma_wake_send_waiters(rdma, sqecount); + + /* When the transport is torn down, assume + * ib_drain_sq() will trigger enough Send + * completions to wake us. The XPT_CLOSE test + * above should then cause the while loop to + * exit. + */ percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); - atomic_inc(&rdma->sc_sq_avail); + trace_svcrdma_sq_full(rdma, &cid); wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 1); - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return -ENOTCONN; - trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); + atomic_read(&rdma->sc_sq_avail) > 0); + trace_svcrdma_sq_retry(rdma, &cid); continue; } trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, wr, NULL); - if (ret) - break; + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) { + trace_svcrdma_sq_post_err(rdma, &cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, there will be a + * Send completion that bumps sc_sq_avail. + */ + if (bad_wr == first_wr) { + svc_rdma_wake_send_waiters(rdma, sqecount); + break; + } + } return 0; } - - trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); - return ret; + return -ENOTCONN; } /** @@ -839,16 +870,10 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, - * the server may send all, a portion of, or none of the xdr_buf. + * the server may Send all, a portion of, or none of the xdr_buf. * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * - * RDMA Send is the last step of transmitting an RPC reply. Pages - * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the sctxt's page array. These pages are - * DMA unmapped by each Write completion, but the subsequent Send - * completion finally releases these pages. - * * Assumptions: * - The Reply's transport header will never be larger than a page. */ @@ -857,6 +882,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp) { + struct ib_send_wr *send_wr = &sctxt->sc_send_wr; int ret; ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, @@ -864,16 +890,19 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, if (ret < 0) return ret; + /* Transfer pages involved in RDMA Writes to the sctxt's + * page array. Completion handling releases these pages. + */ svc_rdma_save_io_pages(rqstp, sctxt); if (rctxt->rc_inv_rkey) { - sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; - sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey; } else { - sctxt->sc_send_wr.opcode = IB_WR_SEND; + send_wr->opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /** @@ -937,7 +966,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - if (svc_rdma_send(rdma, sctxt)) + if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -984,10 +1013,19 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); if (ret < 0) - goto reply_chunk; - rc_size = ret; + goto put_ctxt; + + rc_size = 0; + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) { + ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, sctxt, + &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; + rc_size = ret; + } *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); @@ -1030,45 +1068,33 @@ drop_connection: /** * svc_rdma_result_payload - special processing for a result payload - * @rqstp: svc_rqst to operate on - * @offset: payload's byte offset in @xdr + * @rqstp: RPC transaction context + * @offset: payload's byte offset in @rqstp->rq_res * @length: size of payload, in bytes * + * Assign the passed-in result payload to the current Write chunk, + * and advance to cur_result_payload to the next Write chunk, if + * there is one. + * * Return values: * %0 if successful or nothing needed to be done - * %-EMSGSIZE on XDR buffer overflow * %-E2BIG if the payload was larger than the Write chunk - * %-EINVAL if client provided too many segments - * %-ENOMEM if rdma_rw context pool was exhausted - * %-ENOTCONN if posting failed (connection is lost) - * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; struct svc_rdma_chunk *chunk; - struct svcxprt_rdma *rdma; - struct xdr_buf subbuf; - int ret; chunk = rctxt->rc_cur_result_payload; if (!length || !chunk) return 0; rctxt->rc_cur_result_payload = pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) return -E2BIG; - chunk->ch_position = offset; chunk->ch_payload_length = length; - - if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) - return -EMSGSIZE; - - rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); - if (ret < 0) - return ret; return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4f27325ace..2b1c16b954 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -415,15 +415,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + - newxprt->sc_recv_batch; + newxprt->sc_recv_batch + 1 /* drain */; if (rq_depth > dev->attrs.max_qp_wr) { rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); - ctxts *= newxprt->sc_max_requests; + + /* Arbitrarily estimate the number of rw_ctxs needed for + * this transport. This is enough rw_ctxs to make forward + * progress even if the client is using one rkey per page + * in each Read chunk. + */ + ctxts = 3 * RPCSVC_MAXPAGES; newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; @@ -460,12 +465,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); - + dprintk(" send CQ depth = %u, recv CQ depth = %u\n", + newxprt->sc_sq_depth, rq_depth); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { trace_svcrdma_qp_err(newxprt, ret); goto errout; } + newxprt->sc_max_send_sges = qp_attr.cap.max_send_sge; newxprt->sc_qp = newxprt->sc_cm_id->qp; if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 29b0562d62..9a8ce5df83 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -137,7 +137,6 @@ static struct ctl_table xr_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; #endif diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4f8d7efa46..a0b071089e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -244,7 +244,11 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: pr_info("rpcrdma: removing device %s for %pISpc\n", ep->re_id->device->name, sap); - fallthrough; + switch (xchg(&ep->re_connect_status, -ENODEV)) { + case 0: goto wake_connect_worker; + case 1: goto disconnected; + } + return 0; case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; goto disconnected; @@ -893,6 +897,8 @@ static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt) static void rpcrdma_req_reset(struct rpcrdma_req *req) { + struct rpcrdma_mr *mr; + /* Credits are valid for only one connection */ req->rl_slot.rq_cong = 0; @@ -902,7 +908,19 @@ static void rpcrdma_req_reset(struct rpcrdma_req *req) rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); - frwr_reset(req); + /* The verbs consumer can't know the state of an MR on the + * req->rl_registered list unless a successful completion + * has occurred, so they cannot be re-used. + */ + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { + struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&mr->mr_all); + spin_unlock(&buf->rb_lock); + + frwr_mr_release(mr); + } } /* ASSUMPTION: the rb_allreqs list is stable for the duration, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 58f3dc8d0d..0e1691316f 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -62,6 +62,7 @@ #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); +static void xs_reset_srcport(struct sock_xprt *transport); static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -159,7 +160,6 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, }; /* @@ -883,6 +883,17 @@ static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf) return xdr_alloc_bvec(buf, rpc_task_gfp_mask()); } +static void xs_stream_abort_send_request(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + + if (transport->xmit.offset != 0 && + !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) + xprt_force_disconnect(xprt); +} + /* * Determine if the previous message in the stream was aborted before it * could complete transmission. @@ -1565,8 +1576,10 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE: if (test_and_clear_bit(XPRT_SOCK_CONNECTING, - &transport->sock_state)) + &transport->sock_state)) { + xs_reset_srcport(transport); xprt_clear_connecting(xprt); + } clear_bit(XPRT_CLOSING, &xprt->state); /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); @@ -1722,6 +1735,11 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) xs_update_peer_port(xprt); } +static void xs_reset_srcport(struct sock_xprt *transport) +{ + transport->srcport = 0; +} + static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) { if (transport->srcport == 0 && transport->xprt.reuseport) @@ -2423,6 +2441,13 @@ static void xs_tcp_setup_socket(struct work_struct *work) transport->srcport = 0; status = -EAGAIN; break; + case -EPERM: + /* Happens, for instance, if a BPF program is preventing + * the connect. Remap the error so upper layers can better + * deal with it. + */ + status = -ECONNREFUSED; + fallthrough; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. @@ -2645,6 +2670,7 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work) .xprtsec = { .policy = RPC_XPRTSEC_NONE, }, + .stats = upper_clnt->cl_stats, }; unsigned int pflags = current->flags; struct rpc_clnt *lower_clnt; @@ -2987,20 +3013,11 @@ static int bc_send_request(struct rpc_rqst *req) return len; } -/* - * The close routine. Since this is client initiated, we do nothing - */ - static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } -/* - * The xprt destroy routine. Again, because this connection is client - * initiated, we do nothing - */ - static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt); @@ -3021,6 +3038,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, + .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_close, .destroy = xs_destroy, @@ -3068,6 +3086,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, + .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index c9189a970e..6488ead9e4 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -244,6 +244,99 @@ static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, return 0; } +static void switchdev_obj_id_to_helpful_msg(struct net_device *dev, + enum switchdev_obj_id obj_id, + int err, bool add) +{ + const char *action = add ? "add" : "del"; + const char *reason = ""; + const char *problem; + const char *obj_str; + + switch (obj_id) { + case SWITCHDEV_OBJ_ID_UNDEFINED: + obj_str = "Undefined object"; + problem = "Attempted operation is undefined, indicating a possible programming\n" + "error.\n"; + break; + case SWITCHDEV_OBJ_ID_PORT_VLAN: + obj_str = "VLAN entry"; + problem = "Failure in VLAN settings on this port might disrupt network\n" + "segmentation or traffic isolation, affecting network partitioning.\n"; + break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + obj_str = "Port Multicast Database entry"; + problem = "Failure in updating the port's Multicast Database could lead to\n" + "multicast forwarding issues.\n"; + break; + case SWITCHDEV_OBJ_ID_HOST_MDB: + obj_str = "Host Multicast Database entry"; + problem = "Failure in updating the host's Multicast Database may impact multicast\n" + "group memberships or traffic delivery, affecting multicast\n" + "communication.\n"; + break; + case SWITCHDEV_OBJ_ID_MRP: + obj_str = "Media Redundancy Protocol configuration for port"; + problem = "Failure to set MRP ring ID on this port prevents communication with\n" + "the specified redundancy ring, resulting in an inability to engage\n" + "in MRP-based network operations.\n"; + break; + case SWITCHDEV_OBJ_ID_RING_TEST_MRP: + obj_str = "MRP Test Frame Operations for port"; + problem = "Failure to generate/monitor MRP test frames may lead to inability to\n" + "assess the ring's operational integrity and fault response, hindering\n" + "proactive network management.\n"; + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + obj_str = "MRP Ring Role Configuration"; + problem = "Improper MRP ring role configuration may create conflicts in the ring,\n" + "disrupting communication for all participants, or isolate the local\n" + "system from the ring, hindering its ability to communicate with other\n" + "participants.\n"; + break; + case SWITCHDEV_OBJ_ID_RING_STATE_MRP: + obj_str = "MRP Ring State Configuration"; + problem = "Failure to correctly set the MRP ring state can result in network\n" + "loops or leave segments without communication. In a Closed state,\n" + "it maintains loop prevention by blocking one MRM port, while an Open\n" + "state activates in response to failures, changing port states to\n" + "preserve network connectivity.\n"; + break; + case SWITCHDEV_OBJ_ID_IN_TEST_MRP: + obj_str = "MRP_InTest Frame Generation Configuration"; + problem = "Failure in managing MRP_InTest frame generation can misjudge the\n" + "interconnection ring's state, leading to incorrect blocking or\n" + "unblocking of the I/C port. This misconfiguration might result\n" + "in unintended network loops or isolate critical network segments,\n" + "compromising network integrity and reliability.\n"; + break; + case SWITCHDEV_OBJ_ID_IN_ROLE_MRP: + obj_str = "Interconnection Ring Role Configuration"; + problem = "Failure in incorrect assignment of interconnection ring roles\n" + "(MIM/MIC) can impair the formation of the interconnection rings.\n"; + break; + case SWITCHDEV_OBJ_ID_IN_STATE_MRP: + obj_str = "Interconnection Ring State Configuration"; + problem = "Failure in updating the interconnection ring state can lead in\n" + "case of Open state to incorrect blocking or unblocking of the\n" + "I/C port, resulting in unintended network loops or isolation\n" + "of critical network\n"; + break; + default: + obj_str = "Unknown object"; + problem = "Indicating a possible programming error.\n"; + } + + switch (err) { + case -ENOSPC: + reason = "Current HW/SW setup lacks sufficient resources.\n"; + break; + } + + netdev_err(dev, "Failed to %s %s (object id=%d) with error: %pe (%d).\n%s%s\n", + action, obj_str, obj_id, ERR_PTR(err), err, problem, reason); +} + static void switchdev_port_obj_add_deferred(struct net_device *dev, const void *data) { @@ -254,8 +347,7 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev, err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, NULL); if (err && err != -EOPNOTSUPP) - netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", - err, obj->id); + switchdev_obj_id_to_helpful_msg(dev, obj->id, err, true); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } @@ -304,8 +396,7 @@ static void switchdev_port_obj_del_deferred(struct net_device *dev, err = switchdev_port_obj_del_now(dev, obj); if (err && err != -EOPNOTSUPP) - netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", - err, obj->id); + switchdev_obj_id_to_helpful_msg(dev, obj->id, err, false); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 051ed5f6fc..f5017012a0 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -40,7 +40,7 @@ static int is_seen(struct ctl_table_set *set) /* Return standard mode bits for table entry. */ static int net_ctl_permissions(struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); @@ -54,7 +54,6 @@ static int net_ctl_permissions(struct ctl_table_header *head, } static void net_ctl_set_ownership(struct ctl_table_header *head, - struct ctl_table *table, kuid_t *uid, kgid_t *gid) { struct net *net = container_of(head->set, struct net, sysctls); diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index be1c4003d6..bb0d71eb02 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -32,16 +32,17 @@ config TIPC_MEDIA_UDP bool "IP/UDP media type support" depends on TIPC select NET_UDP_TUNNEL + default y help Saying Y here will enable support for running TIPC over IP/UDP - bool - default y + config TIPC_CRYPTO bool "TIPC encryption support" depends on TIPC select CRYPTO select CRYPTO_AES select CRYPTO_GCM + default y help Saying Y here will enable support for TIPC encryption. All TIPC messages will be encrypted/decrypted by using the currently most @@ -49,8 +50,6 @@ config TIPC_CRYPTO entering the TIPC stack. Key setting from user-space is performed via netlink by a user program (e.g. the iproute2 'tipc' tool). - bool - default y config TIPC_DIAG tristate "TIPC: socket monitoring interface" diff --git a/net/tipc/Makefile b/net/tipc/Makefile index ee49a9f1dd..18e1636aa0 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -18,5 +18,5 @@ tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o - -obj-$(CONFIG_TIPC_DIAG) += diag.o +obj-$(CONFIG_TIPC_DIAG) += tipc_diag.o +tipc_diag-y += diag.o diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 878415c435..5a526ebafe 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1079,30 +1079,27 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); b = tipc_bearer_find(net, name); if (!b) { - rtnl_unlock(); NL_SET_ERR_MSG(info->extack, "Bearer not found"); - return -EINVAL; + err = -EINVAL; + goto out; } #ifdef CONFIG_TIPC_MEDIA_UDP if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { - rtnl_unlock(); NL_SET_ERR_MSG(info->extack, "UDP option is unsupported"); - return -EINVAL; + err = -EINVAL; + goto out; } err = tipc_udp_nl_bearer_add(b, attrs[TIPC_NLA_BEARER_UDP_OPTS]); - if (err) { - rtnl_unlock(); - return err; - } } #endif +out: rtnl_unlock(); - return 0; + return err; } int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) diff --git a/net/tipc/diag.c b/net/tipc/diag.c index 18733451c9..54dde8c4e4 100644 --- a/net/tipc/diag.c +++ b/net/tipc/diag.c @@ -95,6 +95,7 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb, } static const struct sock_diag_handler tipc_sock_diag_handler = { + .owner = THIS_MODULE, .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 5c9fd4791c..76284fc538 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -142,9 +142,9 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - *buf = NULL; if (skb_has_frag_list(frag) && __skb_linearize(frag)) goto err; + *buf = NULL; frag = skb_unshare(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; @@ -156,6 +156,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (!head) goto err; + /* Either the input skb ownership is transferred to headskb + * or the input skb is freed, clear the reference to avoid + * bad access on error path. + */ + *buf = NULL; if (skb_try_coalesce(head, frag, &headstolen, &delta)) { kfree_skb_partial(frag, headstolen); } else { @@ -179,7 +184,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) *headbuf = NULL; return 1; } - *buf = NULL; return 0; err: kfree_skb(*buf); diff --git a/net/tipc/node.c b/net/tipc/node.c index 3105abe97b..500320e5ca 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -86,8 +86,6 @@ struct tipc_bclink_entry { * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain - * @inputq: pointer to input queue containing messages for msg event - * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry @@ -2107,6 +2105,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) } else { n = tipc_node_find_by_id(net, ehdr->id); } + skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index bb1118d02f..2d58ecae4e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -80,7 +80,6 @@ struct sockaddr_pair { * @phdr: preformatted message header used when sending messages * @cong_links: list of congested links * @publications: list of publications for port - * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @conn_timeout: the time we can wait for an unresponded setup request * @probe_unacked: probe has not received ack yet @@ -147,8 +146,6 @@ static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); -static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, - bool kern); static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua); static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua); @@ -2712,13 +2709,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * tipc_accept - wait for connection request * @sock: listening socket * @new_sock: new socket that is to be connected - * @flags: file-related flags associated with socket - * @kern: caused by kernel or by userspace? + * @arg: arguments for accept * * Return: 0 on success, errno otherwise */ -static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, - bool kern) +static int tipc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg) { struct sock *new_sk, *sk = sock->sk; struct tipc_sock *new_tsock; @@ -2734,14 +2730,14 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, res = -EINVAL; goto exit; } - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); res = tipc_wait_for_accept(sock, timeo); if (res) goto exit; buf = skb_peek(&sk->sk_receive_queue); - res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern); + res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, arg->kern); if (res) goto exit; security_sk_clone(sock->sk, new_sock->sk); @@ -3566,11 +3562,8 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, rhashtable_walk_start(iter); while ((tsk = rhashtable_walk_next(iter)) != NULL) { if (IS_ERR(tsk)) { - err = PTR_ERR(tsk); - if (err == -EAGAIN) { - err = 0; + if (PTR_ERR(tsk) == -EAGAIN) continue; - } break; } diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 9fb65c988f..30d2e06e3d 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -91,7 +91,6 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - {} }; int tipc_register_sysctl(void) diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 04af83f050..865142ed0a 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -145,7 +145,7 @@ DECLARE_EVENT_CLASS(tipc_skb_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); tipc_skb_dump(skb, more, __get_str(buf)); ), @@ -172,7 +172,7 @@ DECLARE_EVENT_CLASS(tipc_list_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); tipc_list_dump(list, more, __get_str(buf)); ), @@ -200,7 +200,7 @@ DECLARE_EVENT_CLASS(tipc_sk_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); __entry->portid = tipc_sock_get_portid(sk); tipc_sk_dump(sk, dqueues, __get_str(buf)); if (skb) @@ -254,7 +254,7 @@ DECLARE_EVENT_CLASS(tipc_link_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), @@ -337,7 +337,7 @@ DECLARE_EVENT_CLASS(tipc_node_class, ), TP_fast_assign( - __assign_str(header, header); + __assign_str(header); __entry->addr = tipc_node_get_addr(n); tipc_node_dump(n, more, __get_str(buf)); ), @@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(tipc_fsm_class, ), TP_fast_assign( - __assign_str(name, name); + __assign_str(name); __entry->os = os; __entry->ns = ns; __entry->evt = evt; @@ -409,8 +409,8 @@ TRACE_EVENT(tipc_l2_device_event, ), TP_fast_assign( - __assign_str(dev_name, dev->name); - __assign_str(b_name, b->name); + __assign_str(dev_name); + __assign_str(b_name); __entry->evt = evt; __entry->b_up = test_bit(0, &b->up); __entry->carrier = netif_carrier_ok(dev); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index f892b0903d..439f755399 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -135,8 +135,11 @@ static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size) snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port)); else if (ntohs(ua->proto) == ETH_P_IPV6) snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port)); - else + else { pr_err("Invalid UDP media address\n"); + return 1; + } + return 0; } @@ -174,7 +177,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, local_bh_disable(); ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { - struct rtable *rt = (struct rtable *)ndst; + struct rtable *rt = dst_rtable(ndst); if (!rt) { struct flowi4 fl = { diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 0cdc1f7b6b..ce8d56a191 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -20,6 +20,7 @@ config TLS config TLS_DEVICE bool "Transport Layer Security HW offload" depends on TLS + select SKB_DECRYPTED select SOCK_VALIDATE_XMIT select SOCK_RX_QUEUE_MAPPING default n diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bf8ed36b1a..ab6e694f7b 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -37,6 +37,7 @@ #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> +#include <linux/skbuff_ref.h> #include "tls.h" #include "trace.h" diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 4e7228f275..f9e3d3d90d 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -33,6 +33,7 @@ #include <crypto/aead.h> #include <crypto/scatterwalk.h> #include <net/ip6_checksum.h> +#include <linux/skbuff_ref.h> #include "tls.h" diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b4674f03d7..90b7f253d3 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -816,9 +816,17 @@ struct tls_context *tls_ctx_create(struct sock *sk) return NULL; mutex_init(&ctx->tx_lock); - rcu_assign_pointer(icsk->icsk_ulp_data, ctx); ctx->sk_proto = READ_ONCE(sk->sk_prot); ctx->sk = sk; + /* Release semantic of rcu_assign_pointer() ensures that + * ctx->sk_proto is visible before changing sk->sk_prot in + * update_sk_prot(), and prevents reading uninitialized value in + * tls_{getsockopt, setsockopt}. Note that we do not need a + * read barrier in tls_{getsockopt,setsockopt} as there is an + * address dependency between sk->sk_proto->{getsockopt,setsockopt} + * and ctx->sk_proto. + */ + rcu_assign_pointer(icsk->icsk_ulp_data, ctx); return ctx; } diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 5df08d848b..77e33e1e34 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/workqueue.h> #include <net/strparser.h> #include <net/tcp.h> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index b783231668..305a412785 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2147,7 +2147,6 @@ recv_end: if (ret) { if (err >= 0 || err == -EINPROGRESS) err = ret; - decrypted = 0; goto end; } diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 28b232f281..8b5d04210d 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -16,11 +16,6 @@ config UNIX Say Y unless you know what you are doing. -config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4..4ddd125c46 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9df15a7bc2..be5266007b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,8 +118,6 @@ #include <linux/btf_ids.h> #include <linux/bpf-cgroup.h> -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -223,15 +221,9 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk) return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } -static inline int unix_recvq_full(const struct sock *sk) -{ - return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; -} - static inline int unix_recvq_full_lockless(const struct sock *sk) { - return skb_queue_len_lockless(&sk->sk_receive_queue) > - READ_ONCE(sk->sk_max_ack_backlog); + return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } struct sock *unix_peer_get(struct sock *s) @@ -532,10 +524,10 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) return 0; } -static int unix_writable(const struct sock *sk) +static int unix_writable(const struct sock *sk, unsigned char state) { - return sk->sk_state != TCP_LISTEN && - (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; + return state != TCP_LISTEN && + (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); } static void unix_write_space(struct sock *sk) @@ -543,12 +535,12 @@ static void unix_write_space(struct sock *sk) struct socket_wq *wq; rcu_read_lock(); - if (unix_writable(sk)) { + if (unix_writable(sk, READ_ONCE(sk->sk_state))) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } @@ -572,7 +564,6 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) sk_error_report(other); } } - other->sk_state = TCP_CLOSE; } static void unix_sock_destructor(struct sock *sk) @@ -619,7 +610,7 @@ static void unix_release_sock(struct sock *sk, int embrion) u->path.dentry = NULL; u->path.mnt = NULL; state = sk->sk_state; - sk->sk_state = TCP_CLOSE; + WRITE_ONCE(sk->sk_state, TCP_CLOSE); skpair = unix_peer(sk); unix_peer(sk) = NULL; @@ -640,7 +631,7 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) + if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); @@ -733,7 +724,7 @@ static int unix_listen(struct socket *sock, int backlog) if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; /* Only stream/seqpacket sockets accept */ err = -EINVAL; - if (!u->addr) + if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) @@ -741,7 +732,8 @@ static int unix_listen(struct socket *sock, int backlog) if (backlog > sk->sk_max_ack_backlog) wake_up_interruptible_all(&u->peer_wait); sk->sk_max_ack_backlog = backlog; - sk->sk_state = TCP_LISTEN; + WRITE_ONCE(sk->sk_state, TCP_LISTEN); + /* set credentials so connect can copy them */ init_peercred(sk); err = 0; @@ -757,7 +749,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); -static int unix_accept(struct socket *, struct socket *, int, bool); +static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, @@ -978,14 +970,14 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; - sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; + sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); - u->inflight = 0; + u->listener = NULL; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1133,8 +1125,8 @@ static struct sock *unix_find_other(struct net *net, static int unix_autobind(struct sock *sk) { - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; @@ -1157,6 +1149,7 @@ static int unix_autobind(struct sock *sk) addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); + old_hash = sk->sk_hash; ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: @@ -1197,8 +1190,8 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct mnt_idmap *idmap; struct unix_address *addr; @@ -1236,6 +1229,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, if (u->addr) goto out_unlock; + old_hash = sk->sk_hash; new_hash = unix_bsd_hash(d_backing_inode(dentry)); unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); @@ -1263,8 +1257,8 @@ out: static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { - unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + unsigned int new_hash, old_hash; struct net *net = sock_net(sk); struct unix_address *addr; int err; @@ -1282,6 +1276,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } + old_hash = sk->sk_hash; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(net, old_hash, new_hash); @@ -1371,7 +1366,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !unix_sk(sk)->addr) { + !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1401,7 +1396,8 @@ restart: if (err) goto out_unlock; - sk->sk_state = other->sk_state = TCP_ESTABLISHED; + WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); + WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); } else { /* * 1003.1g breaking connected state with AF_UNSPEC @@ -1418,13 +1414,20 @@ restart: unix_peer(sk) = other; if (!other) - sk->sk_state = TCP_CLOSE; + WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); - if (other != old_peer) + if (other != old_peer) { unix_dgram_disconnected(sk, old_peer); + + unix_state_lock(old_peer); + if (!unix_peer(old_peer)) + WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); + unix_state_unlock(old_peer); + } + sock_put(old_peer); } else { unix_peer(sk) = other; @@ -1470,9 +1473,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; + unsigned char state; long timeo; int err; - int st; err = unix_validate_addr(sunaddr, addr_len); if (err) @@ -1483,7 +1486,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { + test_bit(SOCK_PASSPIDFD, &sock->flags)) && + !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1520,7 +1524,6 @@ restart: goto out; } - /* Latch state of peer */ unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ @@ -1536,7 +1539,7 @@ restart: if (other->sk_shutdown & RCV_SHUTDOWN) goto out_unlock; - if (unix_recvq_full(other)) { + if (unix_recvq_full_lockless(other)) { err = -EAGAIN; if (!timeo) goto out_unlock; @@ -1550,39 +1553,21 @@ restart: goto restart; } - /* Latch our state. - - It is tricky place. We need to grab our state lock and cannot - drop lock on peer. It is dangerous because deadlock is - possible. Connect to self case and simultaneous - attempt to connect are eliminated by checking socket - state. other is TCP_LISTEN, if sk is TCP_LISTEN we - check this before attempt to grab lock. - - Well, and we have to recheck the state after socket locked. + /* self connect and simultaneous connect are eliminated + * by rejecting TCP_LISTEN socket to avoid deadlock. */ - st = sk->sk_state; - - switch (st) { - case TCP_CLOSE: - /* This is ok... continue with connect */ - break; - case TCP_ESTABLISHED: - /* Socket is already connected */ - err = -EISCONN; - goto out_unlock; - default: - err = -EINVAL; + state = READ_ONCE(sk->sk_state); + if (unlikely(state != TCP_CLOSE)) { + err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; goto out_unlock; } unix_state_lock_nested(sk, U_LOCK_SECOND); - if (sk->sk_state != st) { + if (unlikely(sk->sk_state != TCP_CLOSE)) { + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; unix_state_unlock(sk); - unix_state_unlock(other); - sock_put(other); - goto restart; + goto out_unlock; } err = security_unix_stream_connect(sk, other, newsk); @@ -1599,6 +1584,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1630,7 +1616,7 @@ restart: copy_peercred(sk, other); sock->state = SS_CONNECTED; - sk->sk_state = TCP_ESTABLISHED; + WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); sock_hold(newsk); smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ @@ -1690,32 +1676,31 @@ static void unix_sock_inherit_flags(const struct socket *old, set_bit(SOCK_PASSSEC, &new->flags); } -static int unix_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int unix_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; - int err; + struct sock *tsk; - err = -EOPNOTSUPP; + arg->err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) goto out; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) + arg->err = -EINVAL; + if (READ_ONCE(sk->sk_state) != TCP_LISTEN) goto out; /* If socket state is TCP_LISTEN it cannot change (for now...), * so that no locks are necessary. */ - skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, - &err); + skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, + &arg->err); if (!skb) { /* This means receive shutdown. */ - if (err == 0) - err = -EINVAL; + if (arg->err == 0) + arg->err = -EINVAL; goto out; } @@ -1725,6 +1710,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); @@ -1732,7 +1718,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, return 0; out: - return err; + return arg->err; } @@ -1775,51 +1761,60 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + UNIXCB(skb).fp = scm->fp; + scm->fp = NULL; + + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + unix_destroy_fpl(scm->fp); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); +} - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1878,8 +1873,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1887,8 +1884,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* @@ -1908,11 +1907,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -1937,14 +1937,15 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, } if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { + test_bit(SOCK_PASSPIDFD, &sock->flags)) && + !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } err = -EMSGSIZE; - if (len > sk->sk_sndbuf - 32) + if (len > READ_ONCE(sk->sk_sndbuf) - 32) goto out; if (len > SKB_MAX_ALLOC) { @@ -2026,7 +2027,7 @@ restart_locked: unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); - sk->sk_state = TCP_CLOSE; + WRITE_ONCE(sk->sk_state, TCP_CLOSE); unix_state_unlock(sk); unix_dgram_disconnected(sk, other); @@ -2157,13 +2158,15 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other maybe_add_creds(skb, sock, other); skb_get(skb); + scm_stat_add(other, skb); + + spin_lock(&other->sk_receive_queue.lock); if (ousk->oob_skb) consume_skb(ousk->oob_skb); - WRITE_ONCE(ousk->oob_skb, skb); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); - scm_stat_add(other, skb); - skb_queue_tail(&other->sk_receive_queue, skb); sk_send_sigurg(other); unix_state_unlock(other); other->sk_data_ready(other); @@ -2184,11 +2187,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -2200,7 +2204,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } if (msg->msg_namelen) { - err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; + err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { err = -ENOTCONN; @@ -2209,7 +2213,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_err; } - if (sk->sk_shutdown & SEND_SHUTDOWN) + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto pipe_err; while (sent < len) { @@ -2221,7 +2225,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, &err, 0); } else { /* Keep two messages in the pipe so it schedules better */ - size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); + size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); /* allow fallback to order-0 allocations */ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); @@ -2314,7 +2318,7 @@ static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, if (err) return err; - if (sk->sk_state != TCP_ESTABLISHED) + if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; if (msg->msg_namelen) @@ -2328,7 +2332,7 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; - if (sk->sk_state != TCP_ESTABLISHED) + if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) return -ENOTCONN; return unix_dgram_recvmsg(sock, msg, size, flags); @@ -2553,8 +2557,10 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) mutex_lock(&u->iolock); unix_state_lock(sk); + spin_lock(&sk->sk_receive_queue.lock); if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); mutex_unlock(&u->iolock); return -EINVAL; @@ -2566,6 +2572,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) WRITE_ONCE(u->oob_skb, NULL); else skb_get(oob_skb); + + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); @@ -2589,29 +2597,53 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, { struct unix_sock *u = unix_sk(sk); - if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { - skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); - skb = NULL; + if (!unix_skb_len(skb)) { + struct sk_buff *unlinked_skb = NULL; + + spin_lock(&sk->sk_receive_queue.lock); + + if (copied && (!u->oob_skb || skb == u->oob_skb)) { + skb = NULL; + } else if (flags & MSG_PEEK) { + skb = skb_peek_next(skb, &sk->sk_receive_queue); + } else { + unlinked_skb = skb; + skb = skb_peek_next(skb, &sk->sk_receive_queue); + __skb_unlink(unlinked_skb, &sk->sk_receive_queue); + } + + spin_unlock(&sk->sk_receive_queue.lock); + + consume_skb(unlinked_skb); } else { + struct sk_buff *unlinked_skb = NULL; + + spin_lock(&sk->sk_receive_queue.lock); + if (skb == u->oob_skb) { if (copied) { skb = NULL; - } else if (sock_flag(sk, SOCK_URGINLINE)) { - if (!(flags & MSG_PEEK)) { + } else if (!(flags & MSG_PEEK)) { + if (sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); + } else { + __skb_unlink(skb, &sk->sk_receive_queue); + WRITE_ONCE(u->oob_skb, NULL); + unlinked_skb = skb; + skb = skb_peek(&sk->sk_receive_queue); } - } else if (flags & MSG_PEEK) { - skb = NULL; - } else { - skb_unlink(skb, &sk->sk_receive_queue); - WRITE_ONCE(u->oob_skb, NULL); - if (!WARN_ON_ONCE(skb_unref(skb))) - kfree_skb(skb); - skb = skb_peek(&sk->sk_receive_queue); + } else if (!sock_flag(sk, SOCK_URGINLINE)) { + skb = skb_peek_next(skb, &sk->sk_receive_queue); } } + + spin_unlock(&sk->sk_receive_queue.lock); + + if (unlinked_skb) { + WARN_ON_ONCE(skb_unref(unlinked_skb)); + kfree_skb(unlinked_skb); + } } return skb; } @@ -2619,10 +2651,49 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int err; + + if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; - return unix_read_skb(sk, recv_actor); + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (unlikely(skb == READ_ONCE(u->oob_skb))) { + bool drop = false; + + unix_state_lock(sk); + + if (sock_flag(sk, SOCK_DEAD)) { + unix_state_unlock(sk); + kfree_skb(skb); + return -ECONNRESET; + } + + spin_lock(&sk->sk_receive_queue.lock); + if (likely(skb == u->oob_skb)) { + WRITE_ONCE(u->oob_skb, NULL); + drop = true; + } + spin_unlock(&sk->sk_receive_queue.lock); + + unix_state_unlock(sk); + + if (drop) { + WARN_ON_ONCE(skb_unref(skb)); + kfree_skb(skb); + return -EAGAIN; + } + } +#endif + + return recv_actor(sk, skb); } static int unix_stream_read_generic(struct unix_stream_read_state *state, @@ -2643,7 +2714,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, size_t size = state->size; unsigned int last_len; - if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; goto out; } @@ -2974,7 +3045,7 @@ long unix_inq_len(struct sock *sk) struct sk_buff *skb; long amount = 0; - if (sk->sk_state == TCP_LISTEN) + if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; spin_lock(&sk->sk_receive_queue.lock); @@ -3059,12 +3130,23 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) case SIOCATMARK: { + struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int answ = 0; + mutex_lock(&u->iolock); + skb = skb_peek(&sk->sk_receive_queue); - if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) - answ = 1; + if (skb) { + struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); + + if (skb == oob_skb || + (!oob_skb && !unix_skb_len(skb))) + answ = 1; + } + + mutex_unlock(&u->iolock); + err = put_user(answ, (int __user *)arg); } break; @@ -3086,12 +3168,14 @@ static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); + state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) @@ -3113,14 +3197,14 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && - sk->sk_state == TCP_CLOSE) + state == TCP_CLOSE) mask |= EPOLLHUP; /* * we set writable also when the other side has shut down the * connection. This prevents stuck sockets. */ - if (unix_writable(sk)) + if (unix_writable(sk, state)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; @@ -3131,12 +3215,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk, *other; unsigned int writable; + unsigned char state; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; shutdown = READ_ONCE(sk->sk_shutdown); + state = READ_ONCE(sk->sk_state); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || @@ -3156,19 +3242,14 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ - if (sk->sk_type == SOCK_SEQPACKET) { - if (sk->sk_state == TCP_CLOSE) - mask |= EPOLLHUP; - /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) - return mask; - } + if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) + mask |= EPOLLHUP; /* No write status requested, avoid expensive OUT tests. */ if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; - writable = unix_writable(sk); + writable = unix_writable(sk, state); if (writable) { unix_state_lock(sk); diff --git a/net/unix/diag.c b/net/unix/diag.c index be19827eca..937edf4afe 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -65,7 +65,7 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb) u32 *buf; int i; - if (sk->sk_state == TCP_LISTEN) { + if (READ_ONCE(sk->sk_state) == TCP_LISTEN) { spin_lock(&sk->sk_receive_queue.lock); attr = nla_reserve(nlskb, UNIX_DIAG_ICONS, @@ -103,8 +103,8 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) { struct unix_diag_rqlen rql; - if (sk->sk_state == TCP_LISTEN) { - rql.udiag_rqueue = sk->sk_receive_queue.qlen; + if (READ_ONCE(sk->sk_state) == TCP_LISTEN) { + rql.udiag_rqueue = skb_queue_len_lockless(&sk->sk_receive_queue); rql.udiag_wqueue = sk->sk_max_ack_backlog; } else { rql.udiag_rqueue = (u32) unix_inq_len(sk); @@ -136,7 +136,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r rep = nlmsg_data(nlh); rep->udiag_family = AF_UNIX; rep->udiag_type = sk->sk_type; - rep->udiag_state = sk->sk_state; + rep->udiag_state = READ_ONCE(sk->sk_state); rep->pad = 0; rep->udiag_ino = sk_ino; sock_diag_save_cookie(sk, rep->udiag_cookie); @@ -165,7 +165,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO)) goto out_nlmsg_trim; - if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown)) + if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, READ_ONCE(sk->sk_shutdown))) goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_UID) && @@ -215,7 +215,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) sk_for_each(sk, &net->unx.table.buckets[slot]) { if (num < s_num) goto next; - if (!(req->udiag_states & (1 << sk->sk_state))) + if (!(req->udiag_states & (1 << READ_ONCE(sk->sk_state)))) goto next; if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, @@ -322,6 +322,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler unix_diag_handler = { + .owner = THIS_MODULE, .family = AF_UNIX, .dump = unix_diag_handler_dump, }; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2a758531e1..23efb78fe9 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,278 +81,551 @@ #include <net/scm.h> #include <net/tcp_states.h> -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; -/* Internal data structures and random procedures: */ + ops = READ_ONCE(sock->ops); -static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); + /* PF_UNIX ? */ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); + } -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) + return NULL; +} + +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); - - if (sk) { - struct unix_sock *u = unix_sk(sk); - - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. + */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + +static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. + */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; +} + +static LIST_HEAD(unix_unvisited_vertices); + +enum unix_vertex_index { + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, + UNIX_VERTEX_INDEX_START, +}; + +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; } - spin_unlock(&x->sk_receive_queue.lock); + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); } -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); + struct unix_vertex *vertex = edge->predecessor->vertex; - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); + if (!fpl->dead) + unix_update_graph(unix_edge_successor(edge)); - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); + list_del(&edge->vertex_entry); + vertex->out_degree--; - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); } } -static void dec_inflight(struct unix_sock *usk) +static void unix_free_vertices(struct scm_fp_list *fpl) { - usk->inflight--; + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } } -static void inc_inflight(struct unix_sock *usk) +static DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { - usk->inflight++; + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + + receiver->scm_stat.nr_unix_fds += fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); } -static void inc_inflight_move_tail(struct unix_sock *u) +void unix_del_edges(struct scm_fp_list *fpl) { - u->inflight++; + struct unix_sock *receiver; + int i = 0; + + spin_lock(&unix_gc_lock); - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + + if (!fpl->dead) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} + +void unix_update_edges(struct unix_sock *receiver) +{ + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } } -static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 +int unix_prepare_fpl(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex; + int i; + + if (!fpl->count_unix) + return 0; + + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; + + list_add(&vertex->entry, &fpl->vertices); + } + + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; + + return 0; -void wait_for_unix_gc(void) +err: + unix_free_vertices(fpl); + return -ENOMEM; +} + +void unix_destroy_fpl(struct scm_fp_list *fpl) { - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); + if (fpl->inflight) + unix_del_edges(fpl); + + kvfree(fpl->edges); + unix_free_vertices(fpl); } -/* The external entry point: unix_gc() */ -void unix_gc(void) +static bool unix_vertex_dead(struct unix_vertex *vertex) { - struct sk_buff *next_skb, *skb; + struct unix_edge *edge; struct unix_sock *u; - struct unix_sock *next; - struct sk_buff_head hitlist; - struct list_head cursor; - LIST_HEAD(not_cycle_list); + long total_ref; - spin_lock(&unix_gc_lock); + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - * - * Embryos, though never candidates themselves, affect which - * candidates are reachable by the garbage collector. Before - * being added to a listener's queue, an embryo may already - * receive data carrying SCM_RIGHTS, potentially making the - * passed socket a candidate that is not yet reachable by the - * collector. It becomes reachable once the embryo is - * enqueued. Therefore, we must ensure that no SCM-laden - * embryo appears in a (candidate) listener's queue between - * consecutive scan_children() calls. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - struct sock *sk = &u->sk; - long total_refs; - - total_refs = file_count(sk->sk_socket->file); - - BUG_ON(!u->inflight); - BUG_ON(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - - if (sk->sk_state == TCP_LISTEN) { - unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); - unix_state_unlock(sk); + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; +} + +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; + +static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist) +{ + skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + WARN_ON_ONCE(skb_unref(u->oob_skb)); + u->oob_skb = NULL; + } +#endif +} + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + unix_collect_queue(unix_sk(skb->sk), hitlist); + spin_unlock(&embryo_queue->lock); } + } else { + unix_collect_queue(u, hitlist); } + + spin_unlock(&queue->lock); } +} - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + +static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) +{ + LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). + * The vertex will be popped when finalising SCC later. */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); + list_add(&vertex->scc_entry, &vertex_stack); + + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + if (!next_vertex) + continue; + + if (next_vertex->index == unix_vertex_unvisited_index) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); + vertex = next_vertex; + goto next_vertex; - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + next_vertex = vertex; + vertex = edge->predecessor->vertex; + + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index + * to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, + * propagate it to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else { + /* The successor was already grouped as another SCC */ } } - list_del(&cursor); - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); + if (vertex->index == vertex->scc_index) { + struct unix_vertex *v; + struct list_head scc; + bool scc_dead = true; -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; + /* SCC finalised. + * + * If the scc_index was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(v, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&v->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack. */ + v->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(v); } -#endif + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(struct sk_buff_head *hitlist) +{ + unsigned long last_index = UNIX_VERTEX_INDEX_START; + + unix_graph_maybe_cyclic = false; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex, &last_index, hitlist); } - spin_unlock(&unix_gc_lock); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->destructor == io_uring_destruct_scm) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) +{ + unix_graph_maybe_cyclic = false; + + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + bool scc_dead = true; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_move_tail(&vertex->entry, &unix_visited_vertices); + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); } + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* Here we are. Hitlist is filled. Die. */ - __skb_queue_purge(&hitlist); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +} + +static bool gc_in_progress; + +static void __unix_gc(struct work_struct *work) +{ + struct sk_buff_head hitlist; + struct sk_buff *skb; spin_lock(&unix_gc_lock); - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); + goto skip_gc; + } + + __skb_queue_head_init(&hitlist); + + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); - /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); + spin_unlock(&unix_gc_lock); + + skb_queue_walk(&hitlist, skb) { + if (UNIXCB(skb).fp) + UNIXCB(skb).fp->dead = true; + } - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + __skb_queue_purge(&hitlist); +skip_gc: WRITE_ONCE(gc_in_progress, false); +} - wake_up(&unix_gc_wait); +static DECLARE_WORK(unix_gc_work, __unix_gc); - out: - spin_unlock(&unix_gc_lock); +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +void wait_for_unix_gc(struct scm_fp_list *fpl) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). + */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); } diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index e92f2fad64..0000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/fs.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/init.h> -#include <linux/io_uring.h> - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct sock *unix_get_socket(struct file *filp) -{ - struct sock *u_sock = NULL; - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - const struct proto_ops *ops = READ_ONCE(sock->ops); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && ops && ops->family == PF_UNIX) - u_sock = s; - } - - return u_sock; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - if (!u->inflight) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - u->inflight++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!u->inflight); - BUG_ON(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); - -void io_uring_destruct_scm(struct sk_buff *skb) -{ - unix_destruct_scm(skb); -} -EXPORT_SYMBOL(io_uring_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f..0000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 3e84b31c35..357b3e5f38 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -19,7 +19,6 @@ static struct ctl_table unix_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { } }; int __net_init unix_sysctl_register(struct net *net) @@ -52,7 +51,7 @@ err_alloc: void unix_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->unx.ctl->ctl_table_arg; unregister_net_sysctl_table(net->unx.ctl); diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index bd84785bf8..bca2d86ba9 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -54,6 +54,9 @@ static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, struct sk_psock *psock; int copied; + if (flags & MSG_OOB) + return -EOPNOTSUPP; + if (!len) return 0; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 54ba7316f8..4b040285aa 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1500,8 +1500,8 @@ out: return err; } -static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int vsock_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *listener; int err; @@ -1528,7 +1528,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ - timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK); + timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c index 2e29994f92..ab87ef66c1 100644 --- a/net/vmw_vsock/diag.c +++ b/net/vmw_vsock/diag.c @@ -157,6 +157,7 @@ static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler vsock_diag_handler = { + .owner = THIS_MODULE, .family = AF_VSOCK, .dump = vsock_diag_handler_dump, }; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index ee5d306a96..43d4052988 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -859,7 +859,6 @@ static struct virtio_driver virtio_vsock_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtio_vsock_probe, .remove = virtio_vsock_remove, diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 72074fd36d..1d49cc8b6d 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -25,7 +25,7 @@ ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) cfg80211-y += extra-certs.o endif -$(obj)/shipped-certs.c: $(sort $(wildcard $(srctree)/$(src)/certs/*.hex)) +$(obj)/shipped-certs.c: $(sort $(wildcard $(src)/certs/*.hex)) @$(kecho) " GEN $@" $(Q)(echo '#include "reg.h"'; \ echo 'const u8 shipped_regdb_certs[] = {'; \ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index ceb9174c5c..3414b2c3ab 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018-2023 Intel Corporation + * Copyright 2018-2024 Intel Corporation */ #include <linux/export.h> @@ -27,11 +27,10 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, if (WARN_ON(!chan)) return; - chandef->chan = chan; - chandef->freq1_offset = chan->freq_offset; - chandef->center_freq2 = 0; - chandef->edmg.bw_config = 0; - chandef->edmg.channels = 0; + *chandef = (struct cfg80211_chan_def) { + .chan = chan, + .freq1_offset = chan->freq_offset, + }; switch (chan_type) { case NL80211_CHAN_NO_HT: @@ -56,6 +55,73 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, } EXPORT_SYMBOL(cfg80211_chandef_create); +struct cfg80211_per_bw_puncturing_values { + u8 len; + const u16 *valid_values; +}; + +static const u16 puncturing_values_80mhz[] = { + 0x8, 0x4, 0x2, 0x1 +}; + +static const u16 puncturing_values_160mhz[] = { + 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1, 0xc0, 0x30, 0xc, 0x3 +}; + +static const u16 puncturing_values_320mhz[] = { + 0xc000, 0x3000, 0xc00, 0x300, 0xc0, 0x30, 0xc, 0x3, 0xf000, 0xf00, + 0xf0, 0xf, 0xfc00, 0xf300, 0xf0c0, 0xf030, 0xf00c, 0xf003, 0xc00f, + 0x300f, 0xc0f, 0x30f, 0xcf, 0x3f +}; + +#define CFG80211_PER_BW_VALID_PUNCTURING_VALUES(_bw) \ + { \ + .len = ARRAY_SIZE(puncturing_values_ ## _bw ## mhz), \ + .valid_values = puncturing_values_ ## _bw ## mhz \ + } + +static const struct cfg80211_per_bw_puncturing_values per_bw_puncturing[] = { + CFG80211_PER_BW_VALID_PUNCTURING_VALUES(80), + CFG80211_PER_BW_VALID_PUNCTURING_VALUES(160), + CFG80211_PER_BW_VALID_PUNCTURING_VALUES(320) +}; + +static bool valid_puncturing_bitmap(const struct cfg80211_chan_def *chandef) +{ + u32 idx, i, start_freq, primary_center = chandef->chan->center_freq; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_80: + idx = 0; + start_freq = chandef->center_freq1 - 40; + break; + case NL80211_CHAN_WIDTH_160: + idx = 1; + start_freq = chandef->center_freq1 - 80; + break; + case NL80211_CHAN_WIDTH_320: + idx = 2; + start_freq = chandef->center_freq1 - 160; + break; + default: + return chandef->punctured == 0; + } + + if (!chandef->punctured) + return true; + + /* check if primary channel is punctured */ + if (chandef->punctured & (u16)BIT((primary_center - start_freq) / 20)) + return false; + + for (i = 0; i < per_bw_puncturing[idx].len; i++) { + if (per_bw_puncturing[idx].valid_values[i] == chandef->punctured) + return true; + } + + return false; +} + static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef) { int max_contiguous = 0; @@ -317,72 +383,81 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; - return true; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); -static void chandef_primary_freqs(const struct cfg80211_chan_def *c, - u32 *pri40, u32 *pri80, u32 *pri160) +int cfg80211_chandef_primary(const struct cfg80211_chan_def *c, + enum nl80211_chan_width primary_chan_width, + u16 *punctured) { - int tmp; + int pri_width = nl80211_chan_width_to_mhz(primary_chan_width); + int width = cfg80211_chandef_get_width(c); + u32 control = c->chan->center_freq; + u32 center = c->center_freq1; + u16 _punct = 0; - switch (c->width) { - case NL80211_CHAN_WIDTH_40: - *pri40 = c->center_freq1; - *pri80 = 0; - *pri160 = 0; - break; - case NL80211_CHAN_WIDTH_80: - case NL80211_CHAN_WIDTH_80P80: - *pri160 = 0; - *pri80 = c->center_freq1; - /* n_P20 */ - tmp = (30 + c->chan->center_freq - c->center_freq1)/20; - /* n_P40 */ - tmp /= 2; - /* freq_P40 */ - *pri40 = c->center_freq1 - 20 + 40 * tmp; - break; - case NL80211_CHAN_WIDTH_160: - *pri160 = c->center_freq1; - /* n_P20 */ - tmp = (70 + c->chan->center_freq - c->center_freq1)/20; - /* n_P40 */ - tmp /= 2; - /* freq_P40 */ - *pri40 = c->center_freq1 - 60 + 40 * tmp; - /* n_P80 */ - tmp /= 2; - *pri80 = c->center_freq1 - 40 + 80 * tmp; - break; - case NL80211_CHAN_WIDTH_320: - /* n_P20 */ - tmp = (150 + c->chan->center_freq - c->center_freq1) / 20; - /* n_P40 */ - tmp /= 2; - /* freq_P40 */ - *pri40 = c->center_freq1 - 140 + 40 * tmp; - /* n_P80 */ - tmp /= 2; - *pri80 = c->center_freq1 - 120 + 80 * tmp; - /* n_P160 */ - tmp /= 2; - *pri160 = c->center_freq1 - 80 + 160 * tmp; - break; - default: - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(pri_width < 0 || width < 0)) + return -1; + + /* not intended to be called this way, can't determine */ + if (WARN_ON_ONCE(pri_width > width)) + return -1; + + if (!punctured) + punctured = &_punct; + + *punctured = c->punctured; + + while (width > pri_width) { + unsigned int bits_to_drop = width / 20 / 2; + + if (control > center) { + center += width / 4; + *punctured >>= bits_to_drop; + } else { + center -= width / 4; + *punctured &= (1 << bits_to_drop) - 1; + } + width /= 2; } + + return center; } +EXPORT_SYMBOL(cfg80211_chandef_primary); -const struct cfg80211_chan_def * -cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, - const struct cfg80211_chan_def *c2) +static const struct cfg80211_chan_def * +check_chandef_primary_compat(const struct cfg80211_chan_def *c1, + const struct cfg80211_chan_def *c2, + enum nl80211_chan_width primary_chan_width) { - u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80, c1_pri160, c2_pri160; + u16 punct_c1 = 0, punct_c2 = 0; + + /* check primary is compatible -> error if not */ + if (cfg80211_chandef_primary(c1, primary_chan_width, &punct_c1) != + cfg80211_chandef_primary(c2, primary_chan_width, &punct_c2)) + return ERR_PTR(-EINVAL); + + if (punct_c1 != punct_c2) + return ERR_PTR(-EINVAL); + + /* assumes c1 is smaller width, if that was just checked -> done */ + if (c1->width == primary_chan_width) + return c2; + + /* otherwise continue checking the next width */ + return NULL; +} + +static const struct cfg80211_chan_def * +_cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, + const struct cfg80211_chan_def *c2) +{ + const struct cfg80211_chan_def *ret; /* If they are identical, return */ if (cfg80211_chandef_identical(c1, c2)) - return c1; + return c2; /* otherwise, must have same control channel */ if (c1->chan != c2->chan) @@ -396,53 +471,76 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, return NULL; /* - * can't be compatible if one of them is 5 or 10 MHz, + * can't be compatible if one of them is 5/10 MHz or S1G * but they don't have the same width. */ - if (c1->width == NL80211_CHAN_WIDTH_5 || - c1->width == NL80211_CHAN_WIDTH_10 || - c2->width == NL80211_CHAN_WIDTH_5 || - c2->width == NL80211_CHAN_WIDTH_10) +#define NARROW_OR_S1G(width) ((width) == NL80211_CHAN_WIDTH_5 || \ + (width) == NL80211_CHAN_WIDTH_10 || \ + (width) == NL80211_CHAN_WIDTH_1 || \ + (width) == NL80211_CHAN_WIDTH_2 || \ + (width) == NL80211_CHAN_WIDTH_4 || \ + (width) == NL80211_CHAN_WIDTH_8 || \ + (width) == NL80211_CHAN_WIDTH_16) + + if (NARROW_OR_S1G(c1->width) || NARROW_OR_S1G(c2->width)) return NULL; - if (c1->width == NL80211_CHAN_WIDTH_20_NOHT || - c1->width == NL80211_CHAN_WIDTH_20) + /* + * Make sure that c1 is always the narrower one, so that later + * we either return NULL or c2 and don't have to check both + * directions. + */ + if (c1->width > c2->width) + swap(c1, c2); + + /* + * No further checks needed if the "narrower" one is only 20 MHz. + * Here "narrower" includes being a 20 MHz non-HT channel vs. a + * 20 MHz HT (or later) one. + */ + if (c1->width <= NL80211_CHAN_WIDTH_20) return c2; - if (c2->width == NL80211_CHAN_WIDTH_20_NOHT || - c2->width == NL80211_CHAN_WIDTH_20) - return c1; + ret = check_chandef_primary_compat(c1, c2, NL80211_CHAN_WIDTH_40); + if (ret) + return ret; - chandef_primary_freqs(c1, &c1_pri40, &c1_pri80, &c1_pri160); - chandef_primary_freqs(c2, &c2_pri40, &c2_pri80, &c2_pri160); + ret = check_chandef_primary_compat(c1, c2, NL80211_CHAN_WIDTH_80); + if (ret) + return ret; - if (c1_pri40 != c2_pri40) + /* + * If c1 is 80+80, then c2 is 160 or higher, but that cannot + * match. If c2 was also 80+80 it was already either accepted + * or rejected above (identical or not, respectively.) + */ + if (c1->width == NL80211_CHAN_WIDTH_80P80) return NULL; - if (c1->width == NL80211_CHAN_WIDTH_40) - return c2; - - if (c2->width == NL80211_CHAN_WIDTH_40) - return c1; + ret = check_chandef_primary_compat(c1, c2, NL80211_CHAN_WIDTH_160); + if (ret) + return ret; - if (c1_pri80 != c2_pri80) - return NULL; + /* + * Getting here would mean they're both wider than 160, have the + * same primary 160, but are not identical - this cannot happen + * since they must be 320 (no wider chandefs exist, at least yet.) + */ + WARN_ON_ONCE(1); - if (c1->width == NL80211_CHAN_WIDTH_80 && - c2->width > NL80211_CHAN_WIDTH_80) - return c2; + return NULL; +} - if (c2->width == NL80211_CHAN_WIDTH_80 && - c1->width > NL80211_CHAN_WIDTH_80) - return c1; +const struct cfg80211_chan_def * +cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, + const struct cfg80211_chan_def *c2) +{ + const struct cfg80211_chan_def *ret; - WARN_ON(!c1_pri160 && !c2_pri160); - if (c1_pri160 && c2_pri160 && c1_pri160 != c2_pri160) + ret = _cfg80211_chandef_compatible(c1, c2); + if (IS_ERR(ret)) return NULL; - - if (c1->width > c2->width) - return c1; - return c2; + return ret; } EXPORT_SYMBOL(cfg80211_chandef_compatible); @@ -1047,7 +1145,7 @@ EXPORT_SYMBOL(cfg80211_chandef_dfs_cac_time); static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, u32 center_freq, u32 bandwidth, - u32 prohibited_flags) + u32 prohibited_flags, bool monitor) { struct ieee80211_channel *c; u32 freq, start_freq, end_freq; @@ -1057,7 +1155,11 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { c = ieee80211_get_channel_khz(wiphy, freq); - if (!c || c->flags & prohibited_flags) + if (!c) + return false; + if (monitor && c->flags & IEEE80211_CHAN_CAN_MONITOR) + continue; + if (c->flags & prohibited_flags) return false; } @@ -1117,9 +1219,9 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } -bool cfg80211_chandef_usable(struct wiphy *wiphy, - const struct cfg80211_chan_def *chandef, - u32 prohibited_flags) +bool _cfg80211_chandef_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 prohibited_flags, bool monitor) { struct ieee80211_sta_ht_cap *ht_cap; struct ieee80211_sta_vht_cap *vht_cap; @@ -1281,14 +1383,22 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, if (!cfg80211_secondary_chans_ok(wiphy, ieee80211_chandef_to_khz(chandef), - width, prohibited_flags)) + width, prohibited_flags, monitor)) return false; if (!chandef->center_freq2) return true; return cfg80211_secondary_chans_ok(wiphy, MHZ_TO_KHZ(chandef->center_freq2), - width, prohibited_flags); + width, prohibited_flags, monitor); +} + +bool cfg80211_chandef_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 prohibited_flags) +{ + return _cfg80211_chandef_usable(wiphy, chandef, prohibited_flags, + false); } EXPORT_SYMBOL(cfg80211_chandef_usable); @@ -1532,72 +1642,3 @@ struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev, } } EXPORT_SYMBOL(wdev_chandef); - -struct cfg80211_per_bw_puncturing_values { - u8 len; - const u16 *valid_values; -}; - -static const u16 puncturing_values_80mhz[] = { - 0x8, 0x4, 0x2, 0x1 -}; - -static const u16 puncturing_values_160mhz[] = { - 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1, 0xc0, 0x30, 0xc, 0x3 -}; - -static const u16 puncturing_values_320mhz[] = { - 0xc000, 0x3000, 0xc00, 0x300, 0xc0, 0x30, 0xc, 0x3, 0xf000, 0xf00, - 0xf0, 0xf, 0xfc00, 0xf300, 0xf0c0, 0xf030, 0xf00c, 0xf003, 0xc00f, - 0x300f, 0xc0f, 0x30f, 0xcf, 0x3f -}; - -#define CFG80211_PER_BW_VALID_PUNCTURING_VALUES(_bw) \ - { \ - .len = ARRAY_SIZE(puncturing_values_ ## _bw ## mhz), \ - .valid_values = puncturing_values_ ## _bw ## mhz \ - } - -static const struct cfg80211_per_bw_puncturing_values per_bw_puncturing[] = { - CFG80211_PER_BW_VALID_PUNCTURING_VALUES(80), - CFG80211_PER_BW_VALID_PUNCTURING_VALUES(160), - CFG80211_PER_BW_VALID_PUNCTURING_VALUES(320) -}; - -bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, - const struct cfg80211_chan_def *chandef) -{ - u32 idx, i, start_freq; - - switch (chandef->width) { - case NL80211_CHAN_WIDTH_80: - idx = 0; - start_freq = chandef->center_freq1 - 40; - break; - case NL80211_CHAN_WIDTH_160: - idx = 1; - start_freq = chandef->center_freq1 - 80; - break; - case NL80211_CHAN_WIDTH_320: - idx = 2; - start_freq = chandef->center_freq1 - 160; - break; - default: - *bitmap = 0; - break; - } - - if (!*bitmap) - return true; - - /* check if primary channel is punctured */ - if (*bitmap & (u16)BIT((chandef->chan->center_freq - start_freq) / 20)) - return false; - - for (i = 0; i < per_bw_puncturing[idx].len; i++) - if (per_bw_puncturing[idx].valid_values[i] == *bitmap) - return true; - - return false; -} -EXPORT_SYMBOL(cfg80211_valid_disable_subchannel_bitmap); diff --git a/net/wireless/core.c b/net/wireless/core.c index 3fb1b63735..4b1f45e307 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -431,7 +431,7 @@ static void cfg80211_wiphy_work(struct work_struct *work) if (wk) { list_del_init(&wk->entry); if (!list_empty(&rdev->wiphy_work_list)) - schedule_work(work); + queue_work(system_unbound_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); wk->func(&rdev->wiphy, wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index 13657a85cf..118f2f6198 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -362,7 +362,8 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct cfg80211_auth_request *req); int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct cfg80211_assoc_request *req); + struct cfg80211_assoc_request *req, + struct netlink_ext_ack *extack); int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, @@ -491,6 +492,9 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, struct ieee80211_channel *chan, bool primary_only); +bool _cfg80211_chandef_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 prohibited_flags, bool monitor); static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { @@ -549,9 +553,53 @@ int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask); +/** + * struct cfg80211_colocated_ap - colocated AP information + * + * @list: linked list to all colocated APs + * @bssid: BSSID of the reported AP + * @ssid: SSID of the reported AP + * @ssid_len: length of the ssid + * @center_freq: frequency the reported AP is on + * @unsolicited_probe: the reported AP is part of an ESS, where all the APs + * that operate in the same channel as the reported AP and that might be + * detected by a STA receiving this frame, are transmitting unsolicited + * Probe Response frames every 20 TUs + * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP + * @same_ssid: the reported AP has the same SSID as the reporting AP + * @multi_bss: the reported AP is part of a multiple BSSID set + * @transmitted_bssid: the reported AP is the transmitting BSSID + * @colocated_ess: all the APs that share the same ESS as the reported AP are + * colocated and can be discovered via legacy bands. + * @short_ssid_valid: short_ssid is valid and can be used + * @short_ssid: the short SSID for this SSID + * @psd_20: The 20MHz PSD EIRP of the primary 20MHz channel for the reported AP + */ +struct cfg80211_colocated_ap { + struct list_head list; + u8 bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + size_t ssid_len; + u32 short_ssid; + u32 center_freq; + u8 unsolicited_probe:1, + oct_recommended:1, + same_ssid:1, + multi_bss:1, + transmitted_bssid:1, + colocated_ess:1, + short_ssid_valid:1; + s8 psd_20; +}; + #if IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_CFG80211_KUNIT +void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list); + +int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, + struct list_head *list); + size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, const u8 *subie, size_t subie_len, u8 *new_ie, size_t new_ie_len); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index f635a8b6ca..4052041a19 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -4,7 +4,7 @@ * * Copyright (c) 2009, Jouni Malinen <j@w1.fi> * Copyright (c) 2015 Intel Deutschland GmbH - * Copyright (C) 2019-2020, 2022-2023 Intel Corporation + * Copyright (C) 2019-2020, 2022-2024 Intel Corporation */ #include <linux/kernel.h> @@ -241,12 +241,12 @@ void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, char *buf = kmalloc(128, gfp); if (buf) { - sprintf(buf, "MLME-MICHAELMICFAILURE.indication(" - "keyid=%d %scast addr=%pM)", key_id, - key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni", - addr); memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = strlen(buf); + wrqu.data.length = + sprintf(buf, "MLME-MICHAELMICFAILURE." + "indication(keyid=%d %scast addr=%pM)", + key_id, key_type == NL80211_KEYTYPE_GROUP + ? "broad" : "uni", addr); wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); kfree(buf); } @@ -325,28 +325,136 @@ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, p1[i] &= p2[i]; } -/* Note: caller must cfg80211_put_bss() regardless of result */ -int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_assoc_request *req) +static int +cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a, + const struct ieee80211_multi_link_elem *mle_b, + struct netlink_ext_ack *extack) { - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err, i, j; + const struct ieee80211_mle_basic_common_info *common_a, *common_b; - lockdep_assert_wiphy(wdev->wiphy); + common_a = (const void *)mle_a->variable; + common_b = (const void *)mle_b->variable; + + if (memcmp(common_a->mld_mac_addr, common_b->mld_mac_addr, ETH_ALEN)) { + NL_SET_ERR_MSG(extack, "AP MLD address mismatch"); + return -EINVAL; + } + + if (ieee80211_mle_get_eml_med_sync_delay((const u8 *)mle_a) != + ieee80211_mle_get_eml_med_sync_delay((const u8 *)mle_b)) { + NL_SET_ERR_MSG(extack, "link EML medium sync delay mismatch"); + return -EINVAL; + } + + if (ieee80211_mle_get_eml_cap((const u8 *)mle_a) != + ieee80211_mle_get_eml_cap((const u8 *)mle_b)) { + NL_SET_ERR_MSG(extack, "link EML capabilities mismatch"); + return -EINVAL; + } + + if (ieee80211_mle_get_mld_capa_op((const u8 *)mle_a) != + ieee80211_mle_get_mld_capa_op((const u8 *)mle_b)) { + NL_SET_ERR_MSG(extack, "link MLD capabilities/ops mismatch"); + return -EINVAL; + } + + return 0; +} + +static int cfg80211_mlme_check_mlo(struct net_device *dev, + struct cfg80211_assoc_request *req, + struct netlink_ext_ack *extack) +{ + const struct ieee80211_multi_link_elem *mles[ARRAY_SIZE(req->links)] = {}; + int i; + + if (req->link_id < 0) + return 0; + + if (!req->links[req->link_id].bss) { + NL_SET_ERR_MSG(extack, "no BSS for assoc link"); + return -EINVAL; + } + + rcu_read_lock(); + for (i = 0; i < ARRAY_SIZE(req->links); i++) { + const struct cfg80211_bss_ies *ies; + const struct element *ml; - for (i = 1; i < ARRAY_SIZE(req->links); i++) { if (!req->links[i].bss) continue; - for (j = 0; j < i; j++) { - if (req->links[i].bss == req->links[j].bss) - return -EINVAL; + + if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) { + NL_SET_ERR_MSG(extack, "BSSID must not be our address"); + req->links[i].error = -EINVAL; + goto error; } - if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) - return -EINVAL; + ies = rcu_dereference(req->links[i].bss->ies); + ml = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, + ies->data, ies->len); + if (!ml) { + NL_SET_ERR_MSG(extack, "MLO BSS w/o ML element"); + req->links[i].error = -EINVAL; + goto error; + } + + if (!ieee80211_mle_type_ok(ml->data + 1, + IEEE80211_ML_CONTROL_TYPE_BASIC, + ml->datalen - 1)) { + NL_SET_ERR_MSG(extack, "BSS with invalid ML element"); + req->links[i].error = -EINVAL; + goto error; + } + + mles[i] = (const void *)(ml->data + 1); + + if (ieee80211_mle_get_link_id((const u8 *)mles[i]) != i) { + NL_SET_ERR_MSG(extack, "link ID mismatch"); + req->links[i].error = -EINVAL; + goto error; + } + } + + if (WARN_ON(!mles[req->link_id])) + goto error; + + for (i = 0; i < ARRAY_SIZE(req->links); i++) { + if (i == req->link_id || !req->links[i].bss) + continue; + + if (WARN_ON(!mles[i])) + goto error; + + if (cfg80211_mlme_check_mlo_compat(mles[req->link_id], mles[i], + extack)) { + req->links[i].error = -EINVAL; + goto error; + } } + rcu_read_unlock(); + return 0; +error: + rcu_read_unlock(); + return -EINVAL; +} + +/* Note: caller must cfg80211_put_bss() regardless of result */ +int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_assoc_request *req, + struct netlink_ext_ack *extack) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + lockdep_assert_wiphy(wdev->wiphy); + + err = cfg80211_mlme_check_mlo(dev, req, extack); + if (err) + return err; + if (wdev->connected && (!req->prev_bssid || !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bd54a928ba..c2829d673b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/if.h> @@ -468,6 +468,10 @@ static const struct netlink_range_validation nl80211_punct_bitmap_range = { .max = 0xffff, }; +static const struct netlink_range_validation q_range = { + .max = INT_MAX, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -581,7 +585,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, - [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, + [NL80211_ATTR_WPA_VERSIONS] = + NLA_POLICY_RANGE(NLA_U32, 0, + NL80211_WPA_VERSION_1 | + NL80211_WPA_VERSION_2 | + NL80211_WPA_VERSION_3), [NL80211_ATTR_PID] = { .type = NLA_U32 }, [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, [NL80211_ATTR_PMKID] = NLA_POLICY_EXACT_LEN_WARN(WLAN_PMKID_LEN), @@ -750,7 +758,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, - [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_TXQ_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &q_range), [NL80211_ATTR_HE_CAPABILITY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_he_capa, NL80211_HE_MAX_CAPABILITY_LEN), @@ -821,6 +829,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA] = { .type = NLA_FLAG }, [NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), + [NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -906,22 +915,11 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { }; static const struct nla_policy -nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = { - [NL80211_BAND_2GHZ] = { .type = NLA_S32 }, - [NL80211_BAND_5GHZ] = { .type = NLA_S32 }, - [NL80211_BAND_6GHZ] = { .type = NLA_S32 }, - [NL80211_BAND_60GHZ] = { .type = NLA_S32 }, - [NL80211_BAND_LC] = { .type = NLA_S32 }, -}; - -static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, - [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = - NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), }; static const struct nla_policy @@ -1204,11 +1202,14 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_DFS_CONCURRENT) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DFS_CONCURRENT)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT)) + if ((chan->flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT)) + if ((chan->flags & IEEE80211_CHAN_CAN_MONITOR) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_CAN_MONITOR)) goto nla_put_failure; } @@ -3224,24 +3225,9 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) wdev->iftype == NL80211_IFTYPE_P2P_GO; } -static int nl80211_parse_punct_bitmap(struct cfg80211_registered_device *rdev, - struct genl_info *info, - const struct cfg80211_chan_def *chandef, - u16 *punct_bitmap) -{ - if (!wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_PUNCT)) - return -EINVAL; - - *punct_bitmap = nla_get_u32(info->attrs[NL80211_ATTR_PUNCT_BITMAP]); - if (!cfg80211_valid_disable_subchannel_bitmap(punct_bitmap, chandef)) - return -EINVAL; - - return 0; -} - -int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, - struct genl_info *info, - struct cfg80211_chan_def *chandef) +static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, + struct genl_info *info, bool monitor, + struct cfg80211_chan_def *chandef) { struct netlink_ext_ack *extack = info->extack; struct nlattr **attrs = info->attrs; @@ -3266,10 +3252,9 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; - /* Primary channel not allowed */ - if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) { + if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], - "Channel is disabled"); + "Unknown channel"); return -EINVAL; } @@ -3346,13 +3331,27 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->edmg.channels = 0; } + if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) { + chandef->punctured = + nla_get_u32(info->attrs[NL80211_ATTR_PUNCT_BITMAP]); + + if (chandef->punctured && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_PUNCT)) { + NL_SET_ERR_MSG(extack, + "driver doesn't support puncturing"); + return -EINVAL; + } + } + if (!cfg80211_chandef_valid(chandef)) { NL_SET_ERR_MSG(extack, "invalid channel definition"); return -EINVAL; } - if (!cfg80211_chandef_usable(&rdev->wiphy, chandef, - IEEE80211_CHAN_DISABLED)) { + if (!_cfg80211_chandef_usable(&rdev->wiphy, chandef, + IEEE80211_CHAN_DISABLED, + monitor)) { NL_SET_ERR_MSG(extack, "(extension) channel is disabled"); return -EINVAL; } @@ -3367,6 +3366,13 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return 0; } +int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, + struct genl_info *info, + struct cfg80211_chan_def *chandef) +{ + return _nl80211_parse_chandef(rdev, info, false, chandef); +} + static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct genl_info *info, @@ -3391,7 +3397,9 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, link_id = 0; } - result = nl80211_parse_chandef(rdev, info, &chandef); + result = _nl80211_parse_chandef(rdev, info, + iftype == NL80211_IFTYPE_MONITOR, + &chandef); if (result) return result; @@ -3414,6 +3422,33 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, if (chandef.chan != cur_chan) return -EBUSY; + /* only allow this for regular channel widths */ + switch (wdev->links[link_id].ap.chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_320: + break; + default: + return -EINVAL; + } + + switch (chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_320: + break; + default: + return -EINVAL; + } + result = rdev_set_ap_chanwidth(rdev, dev, link_id, &chandef); if (result) @@ -3822,6 +3857,10 @@ int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *ch if (chandef->center_freq2 && nla_put_u32(msg, NL80211_ATTR_CENTER_FREQ2, chandef->center_freq2)) return -ENOBUFS; + if (chandef->punctured && + nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, chandef->punctured)) + return -ENOBUFS; + return 0; } EXPORT_SYMBOL(nl80211_send_chandef); @@ -4202,8 +4241,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (netif_running(dev)) return -EBUSY; - BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != - IEEE80211_MAX_MESH_ID_LEN); wdev->u.mesh.id_up_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); memcpy(wdev->u.mesh.id, @@ -4309,8 +4346,6 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_MESH_POINT: if (!info->attrs[NL80211_ATTR_MESH_ID]) break; - BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != - IEEE80211_MAX_MESH_ID_LEN); wdev->u.mesh.id_up_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); memcpy(wdev->u.mesh.id, @@ -4450,10 +4485,7 @@ static void get_key_callback(void *c, struct key_params *params) struct nlattr *key; struct get_key_cookie *cookie = c; - if ((params->key && - nla_put(cookie->msg, NL80211_ATTR_KEY_DATA, - params->key_len, params->key)) || - (params->seq && + if ((params->seq && nla_put(cookie->msg, NL80211_ATTR_KEY_SEQ, params->seq_len, params->seq)) || (params->cipher && @@ -4465,10 +4497,7 @@ static void get_key_callback(void *c, struct key_params *params) if (!key) goto nla_put_failure; - if ((params->key && - nla_put(cookie->msg, NL80211_KEY_DATA, - params->key_len, params->key)) || - (params->seq && + if ((params->seq && nla_put(cookie->msg, NL80211_KEY_SEQ, params->seq_len, params->seq)) || (params->cipher && @@ -6069,14 +6098,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } - if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) { - err = nl80211_parse_punct_bitmap(rdev, info, - ¶ms->chandef, - ¶ms->punct_bitmap); - if (err) - goto out; - } - if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms->chandef, wdev->iftype)) { err = -EINVAL; @@ -6876,7 +6897,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; /* When you run into this, adjust the code below for the new flag */ - BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); + BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8); switch (statype) { case CFG80211_STA_MESH_PEER_KERNEL: @@ -6936,6 +6957,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, params->link_sta_params.he_capa || params->link_sta_params.eht_capa) return -EINVAL; + if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) + return -EINVAL; } if (statype != CFG80211_STA_AP_CLIENT && @@ -6959,7 +6982,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | - BIT(NL80211_STA_FLAG_MFP))) + BIT(NL80211_STA_FLAG_MFP) | + BIT(NL80211_STA_FLAG_SPP_AMSDU))) return -EINVAL; /* but authenticated/associated only if driver handles it */ @@ -7518,7 +7542,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) return -EINVAL; /* When you run into this, adjust the code below for the new flag */ - BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); + BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8); switch (dev->ieee80211_ptr->iftype) { case NL80211_IFTYPE_AP: @@ -7542,6 +7566,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.sta_flags_mask & auth_assoc) return -EINVAL; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT) && + params.sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) + return -EINVAL; + /* Older userspace, or userspace wanting to be compatible with * !NL80211_FEATURE_FULL_AP_CLIENT_STATE, will not set the auth * and assoc flags in the mask, but assumes the station will be @@ -7630,14 +7659,16 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; struct station_del_parameters params; + int link_id = nl80211_link_id_or_invalid(info->attrs); memset(¶ms, 0, sizeof(params)); if (info->attrs[NL80211_ATTR_MAC]) params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]); - switch (dev->ieee80211_ptr->iftype) { + switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: @@ -7678,6 +7709,17 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID; } + /* Link ID not expected in case of non-ML operation */ + if (!wdev->valid_links && link_id != -1) + return -EINVAL; + + /* If given, a valid link ID should be passed during MLO */ + if (wdev->valid_links && link_id >= 0 && + !(wdev->valid_links & BIT(link_id))) + return -EINVAL; + + params.link_id = link_id; + return rdev_del_station(rdev, dev, ¶ms); } @@ -8102,7 +8144,8 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) is_indoor = true; } - return regulatory_hint_indoor(is_indoor, owner_nlportid); + regulatory_hint_indoor(is_indoor, owner_nlportid); + return 0; default: return -EINVAL; } @@ -9148,6 +9191,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; size_t ie_len, size; + size_t ssids_offset, ie_offset; wiphy = &rdev->wiphy; @@ -9193,21 +9237,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) return -EINVAL; size = struct_size(request, channels, n_channels); + ssids_offset = size; size = size_add(size, array_size(sizeof(*request->ssids), n_ssids)); + ie_offset = size; size = size_add(size, ie_len); request = kzalloc(size, GFP_KERNEL); if (!request) return -ENOMEM; + request->n_channels = n_channels; if (n_ssids) - request->ssids = (void *)&request->channels[n_channels]; + request->ssids = (void *)request + ssids_offset; request->n_ssids = n_ssids; - if (ie_len) { - if (n_ssids) - request->ie = (void *)(request->ssids + n_ssids); - else - request->ie = (void *)(request->channels + n_channels); - } + if (ie_len) + request->ie = (void *)request + ie_offset; i = 0; if (scan_freqs) { @@ -9483,41 +9526,6 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, return 0; } -static int -nl80211_parse_sched_scan_per_band_rssi(struct wiphy *wiphy, - struct cfg80211_match_set *match_sets, - struct nlattr *tb_band_rssi, - s32 rssi_thold) -{ - struct nlattr *attr; - int i, tmp, ret = 0; - - if (!wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD)) { - if (tb_band_rssi) - ret = -EOPNOTSUPP; - else - for (i = 0; i < NUM_NL80211_BANDS; i++) - match_sets->per_band_rssi_thold[i] = - NL80211_SCAN_RSSI_THOLD_OFF; - return ret; - } - - for (i = 0; i < NUM_NL80211_BANDS; i++) - match_sets->per_band_rssi_thold[i] = rssi_thold; - - nla_for_each_nested(attr, tb_band_rssi, tmp) { - enum nl80211_band band = nla_type(attr); - - if (band < 0 || band >= NUM_NL80211_BANDS) - return -EINVAL; - - match_sets->per_band_rssi_thold[band] = nla_get_s32(attr); - } - - return 0; -} - static struct cfg80211_sched_scan_request * nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct nlattr **attrs, int max_match_sets) @@ -9792,15 +9800,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (rssi) request->match_sets[i].rssi_thold = nla_get_s32(rssi); - - /* Parse per band RSSI attribute */ - err = nl80211_parse_sched_scan_per_band_rssi(wiphy, - &request->match_sets[i], - tb[NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI], - request->match_sets[i].rssi_thold); - if (err) - goto out_free; - i++; } @@ -10080,6 +10079,42 @@ static int nl80211_notify_radar_detection(struct sk_buff *skb, return 0; } +static int nl80211_parse_counter_offsets(struct cfg80211_registered_device *rdev, + const u8 *data, size_t datalen, + int first_count, struct nlattr *attr, + const u16 **offsets, unsigned int *n_offsets) +{ + int i; + + *n_offsets = 0; + + if (!attr) + return 0; + + if (!nla_len(attr) || (nla_len(attr) % sizeof(u16))) + return -EINVAL; + + *n_offsets = nla_len(attr) / sizeof(u16); + if (rdev->wiphy.max_num_csa_counters && + (*n_offsets > rdev->wiphy.max_num_csa_counters)) + return -EINVAL; + + *offsets = nla_data(attr); + + /* sanity checks - counters should fit and be the same */ + for (i = 0; i < *n_offsets; i++) { + u16 offset = (*offsets)[i]; + + if (offset >= datalen) + return -EINVAL; + + if (first_count != -1 && data[offset] != first_count) + return -EINVAL; + } + + return 0; +} + static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -10091,7 +10126,6 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) int err; bool need_new_beacon = false; bool need_handle_dfs_flag = true; - int len, i; u32 cs_count; if (!rdev->ops->channel_switch || @@ -10176,72 +10210,23 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) goto free; } - len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); - if (!len || (len % sizeof(u16))) { - err = -EINVAL; + err = nl80211_parse_counter_offsets(rdev, params.beacon_csa.tail, + params.beacon_csa.tail_len, + params.count, + csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON], + ¶ms.counter_offsets_beacon, + ¶ms.n_counter_offsets_beacon); + if (err) goto free; - } - params.n_counter_offsets_beacon = len / sizeof(u16); - if (rdev->wiphy.max_num_csa_counters && - (params.n_counter_offsets_beacon > - rdev->wiphy.max_num_csa_counters)) { - err = -EINVAL; + err = nl80211_parse_counter_offsets(rdev, params.beacon_csa.probe_resp, + params.beacon_csa.probe_resp_len, + params.count, + csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP], + ¶ms.counter_offsets_presp, + ¶ms.n_counter_offsets_presp); + if (err) goto free; - } - - params.counter_offsets_beacon = - nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); - - /* sanity checks - counters should fit and be the same */ - for (i = 0; i < params.n_counter_offsets_beacon; i++) { - u16 offset = params.counter_offsets_beacon[i]; - - if (offset >= params.beacon_csa.tail_len) { - err = -EINVAL; - goto free; - } - - if (params.beacon_csa.tail[offset] != params.count) { - err = -EINVAL; - goto free; - } - } - - if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { - len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); - if (!len || (len % sizeof(u16))) { - err = -EINVAL; - goto free; - } - - params.n_counter_offsets_presp = len / sizeof(u16); - if (rdev->wiphy.max_num_csa_counters && - (params.n_counter_offsets_presp > - rdev->wiphy.max_num_csa_counters)) { - err = -EINVAL; - goto free; - } - - params.counter_offsets_presp = - nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); - - /* sanity checks - counters should fit and be the same */ - for (i = 0; i < params.n_counter_offsets_presp; i++) { - u16 offset = params.counter_offsets_presp[i]; - - if (offset >= params.beacon_csa.probe_resp_len) { - err = -EINVAL; - goto free; - } - - if (params.beacon_csa.probe_resp[offset] != - params.count) { - err = -EINVAL; - goto free; - } - } - } skip_beacons: err = nl80211_parse_chandef(rdev, info, ¶ms.chandef); @@ -10272,14 +10257,7 @@ skip_beacons: if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) params.block_tx = true; - if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) { - err = nl80211_parse_punct_bitmap(rdev, info, - ¶ms.chandef, - ¶ms.punct_bitmap); - if (err) - goto free; - } - + params.link_id = link_id; err = rdev_channel_switch(rdev, dev, ¶ms); free: @@ -10652,13 +10630,6 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) return res; } -static bool nl80211_valid_wpa_versions(u32 wpa_versions) -{ - return !(wpa_versions & ~(NL80211_WPA_VERSION_1 | - NL80211_WPA_VERSION_2 | - NL80211_WPA_VERSION_3)); -} - static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -10884,12 +10855,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, return -EINVAL; } - if (info->attrs[NL80211_ATTR_WPA_VERSIONS]) { + if (info->attrs[NL80211_ATTR_WPA_VERSIONS]) settings->wpa_versions = nla_get_u32(info->attrs[NL80211_ATTR_WPA_VERSIONS]); - if (!nl80211_valid_wpa_versions(settings->wpa_versions)) - return -EINVAL; - } if (info->attrs[NL80211_ATTR_AKM_SUITES]) { void *data; @@ -11104,6 +11072,15 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.s1g_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_ASSOC_SPP_AMSDU])) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT)) { + GENL_SET_ERR_MSG(info, "SPP A-MSDUs not supported"); + return -EINVAL; + } + req.flags |= ASSOC_REQ_SPP_AMSDU; + } + req.link_id = nl80211_link_id_or_invalid(info->attrs); if (info->attrs[NL80211_ATTR_MLO_LINKS]) { @@ -11229,7 +11206,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct nlattr *link; int rem = 0; - err = cfg80211_mlme_assoc(rdev, dev, &req); + err = cfg80211_mlme_assoc(rdev, dev, &req, + info->extack); if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { dev->ieee80211_ptr->conn_owner_nlportid = @@ -12677,23 +12655,12 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]); - if (info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]) { - int len = nla_len(info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]); - int i; - - if (len % sizeof(u16)) - return -EINVAL; - - params.n_csa_offsets = len / sizeof(u16); - params.csa_offsets = - nla_data(info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]); - - /* check that all the offsets fit the frame */ - for (i = 0; i < params.n_csa_offsets; i++) { - if (params.csa_offsets[i] >= params.len) - return -EINVAL; - } - } + err = nl80211_parse_counter_offsets(rdev, NULL, params.len, -1, + info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX], + ¶ms.csa_offsets, + ¶ms.n_csa_offsets); + if (err) + return err; if (!params.dont_wait_for_ack) { msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -14092,6 +14059,8 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) error: for (i = 0; i < new_coalesce.n_rules; i++) { tmp_rule = &new_coalesce.rules[i]; + if (!tmp_rule) + continue; for (j = 0; j < tmp_rule->n_patterns; j++) kfree(tmp_rule->patterns[j].mask); kfree(tmp_rule->patterns); @@ -16091,6 +16060,7 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) params.counter_offset_presp = offset; } + params.link_id = nl80211_link_id(info->attrs); err = rdev_color_change(rdev, dev, ¶ms); out: @@ -16830,6 +16800,10 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_station, .flags = GENL_UNS_ADMIN_PERM, + /* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on + * whether MAC address is passed or not. If MAC address is + * passed, then even during MLO, link ID is not required. + */ .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { @@ -17489,7 +17463,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_color_change, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_SET_FILS_AAD, @@ -19400,7 +19375,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef, gfp_t gfp, enum nl80211_commands notif, - u8 count, bool quiet, u16 punct_bitmap) + u8 count, bool quiet) { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct sk_buff *msg; @@ -19434,9 +19409,6 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, goto nla_put_failure; } - if (nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, punct_bitmap)) - goto nla_put_failure; - genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -19449,7 +19421,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, void cfg80211_ch_switch_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - unsigned int link_id, u16 punct_bitmap) + unsigned int link_id) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -19458,7 +19430,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, lockdep_assert_wiphy(wdev->wiphy); WARN_INVALID_LINK_ID(wdev, link_id); - trace_cfg80211_ch_switch_notify(dev, chandef, link_id, punct_bitmap); + trace_cfg80211_ch_switch_notify(dev, chandef, link_id); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: @@ -19487,15 +19459,14 @@ void cfg80211_ch_switch_notify(struct net_device *dev, cfg80211_sched_dfs_chan_update(rdev); nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, - NL80211_CMD_CH_SWITCH_NOTIFY, 0, false, - punct_bitmap); + NL80211_CMD_CH_SWITCH_NOTIFY, 0, false); } EXPORT_SYMBOL(cfg80211_ch_switch_notify); void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, unsigned int link_id, u8 count, - bool quiet, u16 punct_bitmap) + bool quiet) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -19504,19 +19475,18 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, lockdep_assert_wiphy(wdev->wiphy); WARN_INVALID_LINK_ID(wdev, link_id); - trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id, - punct_bitmap); + trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id); nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, - count, quiet, punct_bitmap); + count, quiet); } EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); int cfg80211_bss_color_notify(struct net_device *dev, enum nl80211_commands cmd, u8 count, - u64 color_bitmap) + u64 color_bitmap, u8 link_id) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -19539,6 +19509,10 @@ int cfg80211_bss_color_notify(struct net_device *dev, if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; + if (wdev->valid_links && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) + goto nla_put_failure; + if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED && nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count)) goto nla_put_failure; @@ -19887,6 +19861,11 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS)) goto free_msg; + if (wakeup->unprot_deauth_disassoc && + nla_put_flag(msg, + NL80211_WOWLAN_TRIG_UNPROTECTED_DEAUTH_DISASSOC)) + goto free_msg; + if (wakeup->packet) { u32 pkt_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211; u32 len_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN; @@ -20167,9 +20146,26 @@ int cfg80211_external_auth_request(struct net_device *dev, if (!hdr) goto nla_put_failure; + /* Some historical mistakes in drivers <-> userspace interface (notably + * between drivers and wpa_supplicant) led to a big-endian conversion + * being needed on NL80211_ATTR_AKM_SUITES _only_ when its value is + * WLAN_AKM_SUITE_SAE. This is now fixed on userspace side, but for the + * benefit of older wpa_supplicant versions, send this particular value + * in big-endian. Note that newer wpa_supplicant will also detect this + * particular value in big endian still, so it all continues to work. + */ + if (params->key_mgmt_suite == WLAN_AKM_SUITE_SAE) { + if (nla_put_be32(msg, NL80211_ATTR_AKM_SUITES, + cpu_to_be32(WLAN_AKM_SUITE_SAE))) + goto nla_put_failure; + } else { + if (nla_put_u32(msg, NL80211_ATTR_AKM_SUITES, + params->key_mgmt_suite)) + goto nla_put_failure; + } + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || - nla_put_u32(msg, NL80211_ATTR_AKM_SUITES, params->key_mgmt_suite) || nla_put_u32(msg, NL80211_ATTR_EXTERNAL_AUTH_ACTION, params->action) || nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) || diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index e106dcea39..c569c37da3 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -56,7 +56,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.burst_period = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]) out->ftm.burst_period = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); + nla_get_u16(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]); out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP]; if (out->ftm.asap && !capa->ftm.asap) { @@ -75,7 +75,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.num_bursts_exp = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]) out->ftm.num_bursts_exp = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]); if (capa->ftm.max_bursts_exponent >= 0 && out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) { @@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.burst_duration = 15; if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) out->ftm.burst_duration = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) @@ -107,7 +107,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.ftmr_retries = 3; if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]) out->ftm.ftmr_retries = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]); out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI]; if (out->ftm.request_lci && !capa->ftm.request_lci) { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 43897a5269..755af47b88 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2021-2023 Intel Corporation + * Copyright (C) 2018, 2021-2024 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -458,6 +458,10 @@ static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request *request) { int ret; + + if (WARN_ON_ONCE(!request->n_ssids && request->ssids)) + return -EINVAL; + trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, request); trace_rdev_return_int(&rdev->wiphy, ret); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2741b62691..3cef0021a3 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2024 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -57,6 +57,8 @@ #include <linux/verification.h> #include <linux/moduleparam.h> #include <linux/firmware.h> +#include <linux/units.h> + #include <net/cfg80211.h> #include "core.h" #include "reg.h" @@ -1289,20 +1291,17 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd) static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range, u32 freq_khz) { -#define ONE_GHZ_IN_KHZ 1000000 /* * From 802.11ad: directional multi-gigabit (DMG): * Pertaining to operation in a frequency band containing a channel * with the Channel starting frequency above 45 GHz. */ - u32 limit = freq_khz > 45 * ONE_GHZ_IN_KHZ ? - 20 * ONE_GHZ_IN_KHZ : 2 * ONE_GHZ_IN_KHZ; + u32 limit = freq_khz > 45 * KHZ_PER_GHZ ? 20 * KHZ_PER_GHZ : 2 * KHZ_PER_GHZ; if (abs(freq_khz - freq_range->start_freq_khz) <= limit) return true; if (abs(freq_khz - freq_range->end_freq_khz) <= limit) return true; return false; -#undef ONE_GHZ_IN_KHZ } /* @@ -1595,10 +1594,10 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_EHT; if (rd_flags & NL80211_RRF_DFS_CONCURRENT) channel_flags |= IEEE80211_CHAN_DFS_CONCURRENT; - if (rd_flags & NL80211_RRF_NO_UHB_VLP_CLIENT) - channel_flags |= IEEE80211_CHAN_NO_UHB_VLP_CLIENT; - if (rd_flags & NL80211_RRF_NO_UHB_AFC_CLIENT) - channel_flags |= IEEE80211_CHAN_NO_UHB_AFC_CLIENT; + if (rd_flags & NL80211_RRF_NO_6GHZ_VLP_CLIENT) + channel_flags |= IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT; + if (rd_flags & NL80211_RRF_NO_6GHZ_AFC_CLIENT) + channel_flags |= IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT; if (rd_flags & NL80211_RRF_PSD) channel_flags |= IEEE80211_CHAN_PSD; return channel_flags; @@ -3285,7 +3284,7 @@ int regulatory_hint_user(const char *alpha2, return 0; } -int regulatory_hint_indoor(bool is_indoor, u32 portid) +void regulatory_hint_indoor(bool is_indoor, u32 portid) { spin_lock(®_indoor_lock); @@ -3308,8 +3307,6 @@ int regulatory_hint_indoor(bool is_indoor, u32 portid) if (!is_indoor) reg_check_channels(); - - return 0; } void regulatory_netlink_notify(u32 portid) @@ -3667,9 +3664,9 @@ static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) return false; } -int regulatory_hint_found_beacon(struct wiphy *wiphy, - struct ieee80211_channel *beacon_chan, - gfp_t gfp) +void regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp) { struct reg_beacon *reg_beacon; bool processing; @@ -3678,18 +3675,18 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, beacon_chan->flags & IEEE80211_CHAN_RADAR || (beacon_chan->band == NL80211_BAND_2GHZ && !freq_is_chan_12_13_14(beacon_chan->center_freq))) - return 0; + return; spin_lock_bh(®_pending_beacons_lock); processing = pending_reg_beacon(beacon_chan); spin_unlock_bh(®_pending_beacons_lock); if (processing) - return 0; + return; reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); if (!reg_beacon) - return -ENOMEM; + return; pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", beacon_chan->center_freq, beacon_chan->freq_offset, @@ -3709,8 +3706,6 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, spin_unlock_bh(®_pending_beacons_lock); schedule_work(®_work); - - return 0; } static void print_rd_rules(const struct ieee80211_regdomain *rd) diff --git a/net/wireless/reg.h b/net/wireless/reg.h index a02ef5609f..e1b211c4f7 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -42,7 +42,7 @@ int regulatory_hint_user(const char *alpha2, * device is operating in an indoor environment. * @portid: the netlink port ID on which the hint was given. */ -int regulatory_hint_indoor(bool is_indoor, u32 portid); +void regulatory_hint_indoor(bool is_indoor, u32 portid); /** * regulatory_netlink_notify - notify on released netlink socket @@ -82,9 +82,9 @@ bool reg_last_request_cell_base(void); * on a newly found BSS. If you cannot make use of this feature you can * set the wiphy->disable_beacon_hints to true. */ -int regulatory_hint_found_beacon(struct wiphy *wiphy, - struct ieee80211_channel *beacon_chan, - gfp_t gfp); +void regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp); /** * regulatory_hint_country_ie - hints a country IE as a regulatory domain @@ -137,13 +137,14 @@ void regulatory_hint_disconnect(void); * Get a value specifying the U-NII band frequency belongs to. * U-NII bands are defined by the FCC in C.F.R 47 part 15. * - * Returns -EINVAL if freq is invalid, 0 for UNII-1, 1 for UNII-2A, + * Return: -EINVAL if freq is invalid, 0 for UNII-1, 1 for UNII-2A, * 2 for UNII-2B, 3 for UNII-2C and 4 for UNII-3. */ int cfg80211_get_unii(int freq); /** * regulatory_indoor_allowed - is indoor operation allowed + * Return: %true if indoor operation is allowed, %false otherwise */ bool regulatory_indoor_allowed(void); @@ -173,11 +174,13 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, * reg_dfs_domain_same - Checks if both wiphy have same DFS domain configured * @wiphy1: wiphy it's dfs_region to be checked against that of wiphy2 * @wiphy2: wiphy it's dfs_region to be checked against that of wiphy1 + * Return: %true if both wiphys have the same DFS domain, %false otherwise */ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); /** * reg_reload_regdb - reload the regulatory.db firmware file + * Return: 0 for success, an error code otherwise */ int reg_reload_regdb(void); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index f138f88be9..292b530a6d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/kernel.h> #include <linux/slab.h> @@ -77,45 +77,6 @@ MODULE_PARM_DESC(bss_entries_limit, #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) -/** - * struct cfg80211_colocated_ap - colocated AP information - * - * @list: linked list to all colocated aPS - * @bssid: BSSID of the reported AP - * @ssid: SSID of the reported AP - * @ssid_len: length of the ssid - * @center_freq: frequency the reported AP is on - * @unsolicited_probe: the reported AP is part of an ESS, where all the APs - * that operate in the same channel as the reported AP and that might be - * detected by a STA receiving this frame, are transmitting unsolicited - * Probe Response frames every 20 TUs - * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP - * @same_ssid: the reported AP has the same SSID as the reporting AP - * @multi_bss: the reported AP is part of a multiple BSSID set - * @transmitted_bssid: the reported AP is the transmitting BSSID - * @colocated_ess: all the APs that share the same ESS as the reported AP are - * colocated and can be discovered via legacy bands. - * @short_ssid_valid: short_ssid is valid and can be used - * @short_ssid: the short SSID for this SSID - * @psd_20: The 20MHz PSD EIRP of the primary 20MHz channel for the reported AP - */ -struct cfg80211_colocated_ap { - struct list_head list; - u8 bssid[ETH_ALEN]; - u8 ssid[IEEE80211_MAX_SSID_LEN]; - size_t ssid_len; - u32 short_ssid; - u32 center_freq; - u8 unsolicited_probe:1, - oct_recommended:1, - same_ssid:1, - multi_bss:1, - transmitted_bssid:1, - colocated_ess:1, - short_ssid_valid:1; - s8 psd_20; -}; - static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; @@ -566,7 +527,8 @@ static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies, return 0; } -static void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) +VISIBLE_IF_CFG80211_KUNIT void +cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) { struct cfg80211_colocated_ap *ap, *tmp_ap; @@ -575,6 +537,7 @@ static void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) kfree(ap); } } +EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_free_coloc_ap_list); static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, const u8 *pos, u8 length, @@ -648,104 +611,140 @@ static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, return 0; } -static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, - struct list_head *list) +bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len, + enum cfg80211_rnr_iter_ret + (*iter)(void *data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len), + void *iter_data) { - struct ieee80211_neighbor_ap_info *ap_info; - const struct element *elem, *ssid_elem; + const struct element *rnr; const u8 *pos, *end; - u32 s_ssid_tmp; - int n_coloc = 0, ret; - LIST_HEAD(ap_list); - ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); - if (ret) - return 0; + for_each_element_id(rnr, WLAN_EID_REDUCED_NEIGHBOR_REPORT, + elems, elems_len) { + const struct ieee80211_neighbor_ap_info *info; - for_each_element_id(elem, WLAN_EID_REDUCED_NEIGHBOR_REPORT, - ies->data, ies->len) { - pos = elem->data; - end = elem->data + elem->datalen; + pos = rnr->data; + end = rnr->data + rnr->datalen; /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ - while (pos + sizeof(*ap_info) <= end) { - enum nl80211_band band; - int freq; + while (sizeof(*info) <= end - pos) { u8 length, i, count; + u8 type; - ap_info = (void *)pos; - count = u8_get_bits(ap_info->tbtt_info_hdr, - IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; - length = ap_info->tbtt_info_len; + info = (void *)pos; + count = u8_get_bits(info->tbtt_info_hdr, + IEEE80211_AP_INFO_TBTT_HDR_COUNT) + + 1; + length = info->tbtt_info_len; - pos += sizeof(*ap_info); + pos += sizeof(*info); - if (!ieee80211_operating_class_to_band(ap_info->op_class, - &band)) - break; + if (count * length > end - pos) + return false; - freq = ieee80211_channel_to_frequency(ap_info->channel, - band); + type = u8_get_bits(info->tbtt_info_hdr, + IEEE80211_AP_INFO_TBTT_HDR_TYPE); - if (end - pos < count * length) - break; + for (i = 0; i < count; i++) { + switch (iter(iter_data, type, info, + pos, length)) { + case RNR_ITER_CONTINUE: + break; + case RNR_ITER_BREAK: + return true; + case RNR_ITER_ERROR: + return false; + } - if (u8_get_bits(ap_info->tbtt_info_hdr, - IEEE80211_AP_INFO_TBTT_HDR_TYPE) != - IEEE80211_TBTT_INFO_TYPE_TBTT) { - pos += count * length; - continue; + pos += length; } + } - /* TBTT info must include bss param + BSSID + - * (short SSID or same_ssid bit to be set). - * ignore other options, and move to the - * next AP info - */ - if (band != NL80211_BAND_6GHZ || - !(length == offsetofend(struct ieee80211_tbtt_info_7_8_9, - bss_params) || - length == sizeof(struct ieee80211_tbtt_info_7_8_9) || - length >= offsetofend(struct ieee80211_tbtt_info_ge_11, - bss_params))) { - pos += count * length; - continue; - } + if (pos != end) + return false; + } - for (i = 0; i < count; i++) { - struct cfg80211_colocated_ap *entry; + return true; +} +EXPORT_SYMBOL_GPL(cfg80211_iter_rnr); + +struct colocated_ap_data { + const struct element *ssid_elem; + struct list_head ap_list; + u32 s_ssid_tmp; + int n_coloc; +}; - entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, - GFP_ATOMIC); +static enum cfg80211_rnr_iter_ret +cfg80211_parse_colocated_ap_iter(void *_data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len) +{ + struct colocated_ap_data *data = _data; + struct cfg80211_colocated_ap *entry; + enum nl80211_band band; - if (!entry) - goto error; + if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) + return RNR_ITER_CONTINUE; - entry->center_freq = freq; + if (!ieee80211_operating_class_to_band(info->op_class, &band)) + return RNR_ITER_CONTINUE; - if (!cfg80211_parse_ap_info(entry, pos, length, - ssid_elem, - s_ssid_tmp)) { - n_coloc++; - list_add_tail(&entry->list, &ap_list); - } else { - kfree(entry); - } + /* TBTT info must include bss param + BSSID + (short SSID or + * same_ssid bit to be set). Ignore other options, and move to + * the next AP info + */ + if (band != NL80211_BAND_6GHZ || + !(tbtt_info_len == offsetofend(struct ieee80211_tbtt_info_7_8_9, + bss_params) || + tbtt_info_len == sizeof(struct ieee80211_tbtt_info_7_8_9) || + tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, + bss_params))) + return RNR_ITER_CONTINUE; + + entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, GFP_ATOMIC); + if (!entry) + return RNR_ITER_ERROR; + + entry->center_freq = + ieee80211_channel_to_frequency(info->channel, band); + + if (!cfg80211_parse_ap_info(entry, tbtt_info, tbtt_info_len, + data->ssid_elem, data->s_ssid_tmp)) { + data->n_coloc++; + list_add_tail(&entry->list, &data->ap_list); + } else { + kfree(entry); + } - pos += length; - } - } + return RNR_ITER_CONTINUE; +} -error: - if (pos != end) { - cfg80211_free_coloc_ap_list(&ap_list); - return 0; - } +VISIBLE_IF_CFG80211_KUNIT int +cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, + struct list_head *list) +{ + struct colocated_ap_data data = {}; + int ret; + + INIT_LIST_HEAD(&data.ap_list); + + ret = cfg80211_calc_short_ssid(ies, &data.ssid_elem, &data.s_ssid_tmp); + if (ret) + return 0; + + if (!cfg80211_iter_rnr(ies->data, ies->len, + cfg80211_parse_colocated_ap_iter, &data)) { + cfg80211_free_coloc_ap_list(&data.ap_list); + return 0; } - list_splice_tail(&ap_list, list); - return n_coloc; + list_splice_tail(&data.ap_list, list); + return data.n_coloc; } +EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap); static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, struct ieee80211_channel *chan, @@ -813,6 +812,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; + size_t size, offs_ssids, offs_6ghz_params, offs_ies; rdev_req->scan_6ghz = true; @@ -878,10 +878,15 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) spin_unlock_bh(&rdev->bss_lock); } - request = kzalloc(struct_size(request, channels, n_channels) + - sizeof(*request->scan_6ghz_params) * count + - sizeof(*request->ssids) * rdev_req->n_ssids, - GFP_KERNEL); + size = struct_size(request, channels, n_channels); + offs_ssids = size; + size += sizeof(*request->ssids) * rdev_req->n_ssids; + offs_6ghz_params = size; + size += sizeof(*request->scan_6ghz_params) * count; + offs_ies = size; + size += rdev_req->ie_len; + + request = kzalloc(size, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); return -ENOMEM; @@ -889,8 +894,26 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) *request = *rdev_req; request->n_channels = 0; - request->scan_6ghz_params = - (void *)&request->channels[n_channels]; + request->n_6ghz_params = 0; + if (rdev_req->n_ssids) { + /* + * Add the ssids from the parent scan request to the new + * scan request, so the driver would be able to use them + * in its probe requests to discover hidden APs on PSC + * channels. + */ + request->ssids = (void *)request + offs_ssids; + memcpy(request->ssids, rdev_req->ssids, + sizeof(*request->ssids) * request->n_ssids); + } + request->scan_6ghz_params = (void *)request + offs_6ghz_params; + + if (rdev_req->ie_len) { + void *ie = (void *)request + offs_ies; + + memcpy(ie, rdev_req->ie, rdev_req->ie_len); + request->ie = ie; + } /* * PSC channels should not be scanned in case of direct scan with 1 SSID @@ -979,17 +1002,8 @@ skip: if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; - rdev->int_scan_req = request; - /* - * Add the ssids from the parent scan request to the new scan - * request, so the driver would be able to use them in its - * probe requests to discover hidden APs on PSC channels. - */ - request->ssids = (void *)&request->channels[request->n_channels]; - request->n_ssids = rdev_req->n_ssids; - memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * - request->n_ssids); + rdev->int_scan_req = request; /* * If this scan follows a previous scan, save the scan start @@ -2122,6 +2136,40 @@ struct cfg80211_inform_single_bss_data { u64 cannot_use_reasons; }; +static bool cfg80211_6ghz_power_type_valid(const u8 *ie, size_t ielen, + const u32 flags) +{ + const struct element *tmp; + struct ieee80211_he_operation *he_oper; + + tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen); + if (tmp && tmp->datalen >= sizeof(*he_oper) + 1 && + tmp->datalen >= ieee80211_he_oper_size(tmp->data + 1)) { + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + + he_oper = (void *)&tmp->data[1]; + he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); + + if (!he_6ghz_oper) + return false; + + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + return true; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + return !(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT); + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + return !(flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT); + default: + return false; + } + } + return false; +} + /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss * cfg80211_inform_single_bss_data(struct wiphy *wiphy, @@ -2154,6 +2202,14 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, if (!channel) return NULL; + if (channel->band == NL80211_BAND_6GHZ && + !cfg80211_6ghz_power_type_valid(data->ie, data->ielen, + channel->flags)) { + data->use_for = 0; + data->cannot_use_reasons = + NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH; + } + memcpy(tmp.pub.bssid, data->bssid, ETH_ALEN); tmp.pub.channel = channel; if (data->bss_source != BSS_SOURCE_STA_PROFILE) @@ -2165,15 +2221,22 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.ts_boottime = drv_data->boottime_ns; tmp.parent_tsf = drv_data->parent_tsf; ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid); + tmp.pub.chains = drv_data->chains; + memcpy(tmp.pub.chain_signal, drv_data->chain_signal, + IEEE80211_MAX_CHAINS); tmp.pub.use_for = data->use_for; tmp.pub.cannot_use_reasons = data->cannot_use_reasons; - if (data->bss_source != BSS_SOURCE_DIRECT) { + switch (data->bss_source) { + case BSS_SOURCE_MBSSID: tmp.pub.transmitted_bss = data->source_bss; + fallthrough; + case BSS_SOURCE_STA_PROFILE: ts = bss_from_pub(data->source_bss)->ts; tmp.pub.bssid_index = data->bssid_index; tmp.pub.max_bssid_indicator = data->max_bssid_indicator; - } else { + break; + case BSS_SOURCE_DIRECT: ts = jiffies; if (channel->band == NL80211_BAND_60GHZ) { @@ -2188,6 +2251,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, regulatory_hint_found_beacon(wiphy, channel, gfp); } + break; } /* @@ -2208,6 +2272,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, switch (data->ftype) { case CFG80211_BSS_FTYPE_BEACON: + case CFG80211_BSS_FTYPE_S1G_BEACON: ies->from_beacon = true; fallthrough; case CFG80211_BSS_FTYPE_UNKNOWN: @@ -2403,7 +2468,8 @@ cfg80211_parse_mbssid_data(struct wiphy *wiphy, profile, profile_len); if (!mbssid_index_ie || mbssid_index_ie[1] < 1 || mbssid_index_ie[2] == 0 || - mbssid_index_ie[2] > 46) { + mbssid_index_ie[2] > 46 || + mbssid_index_ie[2] >= (1 << elem->data[0])) { /* No valid Multiple BSSID-Index element */ continue; } @@ -2464,16 +2530,22 @@ ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies, if (elem->id == WLAN_EID_EXTENSION) { copied = elem->datalen - 1; - if (copied > data_len) - return -ENOSPC; - memmove(data, elem->data + 1, copied); + if (data) { + if (copied > data_len) + return -ENOSPC; + + memmove(data, elem->data + 1, copied); + } } else { copied = elem->datalen; - if (copied > data_len) - return -ENOSPC; - memmove(data, elem->data, copied); + if (data) { + if (copied > data_len) + return -ENOSPC; + + memmove(data, elem->data, copied); + } } /* Fragmented elements must have 255 bytes */ @@ -2492,10 +2564,13 @@ ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies, elem_datalen = elem->datalen; - if (copied + elem_datalen > data_len) - return -ENOSPC; + if (data) { + if (copied + elem_datalen > data_len) + return -ENOSPC; + + memmove(data + copied, elem->data, elem_datalen); + } - memmove(data + copied, elem->data, elem_datalen); copied += elem_datalen; /* Only the last fragment may be short */ @@ -2601,79 +2676,80 @@ error: return NULL; } -static u8 -cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, - const struct ieee80211_neighbor_ap_info **ap_info, - u8 *param_ch_count) -{ - const struct ieee80211_neighbor_ap_info *info; - const struct element *rnr; - const u8 *pos, *end; - - for_each_element_id(rnr, WLAN_EID_REDUCED_NEIGHBOR_REPORT, ie, ielen) { - pos = rnr->data; - end = rnr->data + rnr->datalen; - - /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ - while (sizeof(*info) <= end - pos) { - const struct ieee80211_rnr_mld_params *mld_params; - u16 params; - u8 length, i, count, mld_params_offset; - u8 type, lid; - u32 use_for; - - info = (void *)pos; - count = u8_get_bits(info->tbtt_info_hdr, - IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; - length = info->tbtt_info_len; +struct tbtt_info_iter_data { + const struct ieee80211_neighbor_ap_info *ap_info; + u8 param_ch_count; + u32 use_for; + u8 mld_id, link_id; + bool non_tx; +}; - pos += sizeof(*info); +static enum cfg80211_rnr_iter_ret +cfg802121_mld_ap_rnr_iter(void *_data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len) +{ + const struct ieee80211_rnr_mld_params *mld_params; + struct tbtt_info_iter_data *data = _data; + u8 link_id; + bool non_tx = false; + + if (type == IEEE80211_TBTT_INFO_TYPE_TBTT && + tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, + mld_params)) { + const struct ieee80211_tbtt_info_ge_11 *tbtt_info_ge_11 = + (void *)tbtt_info; + + non_tx = (tbtt_info_ge_11->bss_params & + (IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID | + IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID)) == + IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; + mld_params = &tbtt_info_ge_11->mld_params; + } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && + tbtt_info_len >= sizeof(struct ieee80211_rnr_mld_params)) + mld_params = (void *)tbtt_info; + else + return RNR_ITER_CONTINUE; - if (count * length > end - pos) - return 0; + link_id = le16_get_bits(mld_params->params, + IEEE80211_RNR_MLD_PARAMS_LINK_ID); - type = u8_get_bits(info->tbtt_info_hdr, - IEEE80211_AP_INFO_TBTT_HDR_TYPE); + if (data->mld_id != mld_params->mld_id) + return RNR_ITER_CONTINUE; - if (type == IEEE80211_TBTT_INFO_TYPE_TBTT && - length >= - offsetofend(struct ieee80211_tbtt_info_ge_11, - mld_params)) { - mld_params_offset = - offsetof(struct ieee80211_tbtt_info_ge_11, mld_params); - use_for = NL80211_BSS_USE_FOR_ALL; - } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && - length >= sizeof(struct ieee80211_rnr_mld_params)) { - mld_params_offset = 0; - use_for = NL80211_BSS_USE_FOR_MLD_LINK; - } else { - pos += count * length; - continue; - } + if (data->link_id != link_id) + return RNR_ITER_CONTINUE; - for (i = 0; i < count; i++) { - mld_params = (void *)pos + mld_params_offset; - params = le16_to_cpu(mld_params->params); + data->ap_info = info; + data->param_ch_count = + le16_get_bits(mld_params->params, + IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); + data->non_tx = non_tx; - lid = u16_get_bits(params, - IEEE80211_RNR_MLD_PARAMS_LINK_ID); + if (type == IEEE80211_TBTT_INFO_TYPE_TBTT) + data->use_for = NL80211_BSS_USE_FOR_ALL; + else + data->use_for = NL80211_BSS_USE_FOR_MLD_LINK; + return RNR_ITER_BREAK; +} - if (mld_id == mld_params->mld_id && - link_id == lid) { - *ap_info = info; - *param_ch_count = - le16_get_bits(mld_params->params, - IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); +static u8 +cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, + const struct ieee80211_neighbor_ap_info **ap_info, + u8 *param_ch_count, bool *non_tx) +{ + struct tbtt_info_iter_data data = { + .mld_id = mld_id, + .link_id = link_id, + }; - return use_for; - } + cfg80211_iter_rnr(ie, ielen, cfg802121_mld_ap_rnr_iter, &data); - pos += length; - } - } - } + *ap_info = data.ap_info; + *param_ch_count = data.param_ch_count; + *non_tx = data.non_tx; - return 0; + return data.use_for; } static struct element * @@ -2795,17 +2871,16 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, struct cfg80211_bss *bss; u8 mld_id, reporter_link_id, bss_change_count; u16 seen_links = 0; - const u8 *pos; u8 i; - if (!ieee80211_mle_size_ok(elem->data + 1, elem->datalen - 1)) + if (!ieee80211_mle_type_ok(elem->data + 1, + IEEE80211_ML_CONTROL_TYPE_BASIC, + elem->datalen - 1)) return; - ml_elem = (void *)elem->data + 1; + ml_elem = (void *)(elem->data + 1); control = le16_to_cpu(ml_elem->control); - if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) != - IEEE80211_ML_CONTROL_TYPE_BASIC) - return; + ml_common_len = ml_elem->variable[0]; /* Must be present when transmitted by an AP (in a probe response) */ if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) || @@ -2813,24 +2888,8 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, !(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) return; - ml_common_len = ml_elem->variable[0]; - - /* length + MLD MAC address */ - pos = ml_elem->variable + 1 + 6; - - reporter_link_id = pos[0]; - pos += 1; - - bss_change_count = pos[0]; - pos += 1; - - if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)) - pos += 2; - if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_EML_CAPA)) - pos += 2; - - /* MLD capabilities and operations */ - pos += 2; + reporter_link_id = ieee80211_mle_get_link_id(elem->data + 1); + bss_change_count = ieee80211_mle_get_bss_param_ch_cnt(elem->data + 1); /* * The MLD ID of the reporting AP is always zero. It is set if the AP @@ -2838,15 +2897,7 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, * relating to a nontransmitted BSS (matching the Multi-BSSID Index, * Draft P802.11be_D3.2, 35.3.4.2) */ - if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MLD_ID)) { - mld_id = *pos; - pos += 1; - } else { - mld_id = 0; - } - - /* Extended MLD capabilities and operations */ - pos += 2; + mld_id = ieee80211_mle_get_mld_id(elem->data + 1); /* Fully defrag the ML element for sta information/profile iteration */ mle = cfg80211_defrag_mle(elem, tx_data->ie, tx_data->ielen, gfp); @@ -2876,6 +2927,7 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, ssize_t profile_len; u8 param_ch_count; u8 link_id, use_for; + bool non_tx; if (!ieee80211_mle_basic_sta_prof_size_ok((u8 *)mle->sta_prof[i], mle->sta_prof_len[i])) @@ -2921,10 +2973,24 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, tx_data->ielen, mld_id, link_id, &ap_info, - ¶m_ch_count); + ¶m_ch_count, + &non_tx); if (!use_for) continue; + /* + * As of 802.11be_D5.0, the specification does not give us any + * way of discovering both the MaxBSSID and the Multiple-BSSID + * Index. It does seem like the Multiple-BSSID Index element + * may be provided, but section 9.4.2.45 explicitly forbids + * including a Multiple-BSSID Element (in this case without any + * subelements). + * Without both pieces of information we cannot calculate the + * reference BSSID, so simply ignore the BSS. + */ + if (non_tx) + continue; + /* We could sanity check the BSSID is included */ if (!ieee80211_operating_class_to_band(ap_info->op_class, @@ -3052,6 +3118,10 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, if (!res) return NULL; + /* don't do any further MBSSID/ML handling for S1G */ + if (ftype == CFG80211_BSS_FTYPE_S1G_BEACON) + return res; + cfg80211_parse_mbssid_data(wiphy, &inform_data, res, gfp); cfg80211_parse_ml_sta_data(wiphy, &inform_data, res, gfp); @@ -3060,59 +3130,21 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_inform_bss_data); -static bool cfg80211_uhb_power_type_valid(const u8 *ie, - size_t ielen, - const u32 flags) -{ - const struct element *tmp; - struct ieee80211_he_operation *he_oper; - - tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen); - if (tmp && tmp->datalen >= sizeof(*he_oper) + 1) { - const struct ieee80211_he_6ghz_oper *he_6ghz_oper; - - he_oper = (void *)&tmp->data[1]; - he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); - - if (!he_6ghz_oper) - return false; - - switch (u8_get_bits(he_6ghz_oper->control, - IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - return true; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - return !(flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT); - case IEEE80211_6GHZ_CTRL_REG_VLP_AP: - return !(flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT); - } - } - return false; -} - -/* cfg80211_inform_bss_width_frame helper */ -static struct cfg80211_bss * -cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, - struct cfg80211_inform_bss *data, - struct ieee80211_mgmt *mgmt, size_t len, - gfp_t gfp) +struct cfg80211_bss * +cfg80211_inform_bss_frame_data(struct wiphy *wiphy, + struct cfg80211_inform_bss *data, + struct ieee80211_mgmt *mgmt, size_t len, + gfp_t gfp) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - struct cfg80211_internal_bss tmp = {}, *res; - struct cfg80211_bss_ies *ies; - struct ieee80211_channel *channel; - bool signal_valid; + size_t min_hdr_len; struct ieee80211_ext *ext = NULL; - u8 *bssid, *variable; - u16 capability, beacon_int; - size_t ielen, min_hdr_len = offsetof(struct ieee80211_mgmt, - u.probe_resp.variable); - int bss_type; - - BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != - offsetof(struct ieee80211_mgmt, u.beacon.variable)); - - trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len); + enum cfg80211_bss_frame_type ftype; + u16 beacon_interval; + const u8 *bssid; + u16 capability; + const u8 *ie; + size_t ielen; + u64 tsf; if (WARN_ON(!mgmt)) return NULL; @@ -3120,48 +3152,40 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, if (WARN_ON(!wiphy)) return NULL; - if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && - (data->signal < 0 || data->signal > 100))) - return NULL; + BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != + offsetof(struct ieee80211_mgmt, u.beacon.variable)); + + trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; - min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); + else + min_hdr_len = offsetof(struct ieee80211_ext, + u.s1g_beacon.variable); + } else { + /* same for beacons */ + min_hdr_len = offsetof(struct ieee80211_mgmt, + u.probe_resp.variable); } if (WARN_ON(len < min_hdr_len)) return NULL; ielen = len - min_hdr_len; - variable = mgmt->u.probe_resp.variable; - if (ext) { - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - variable = ext->u.s1g_short_beacon.variable; - else - variable = ext->u.s1g_beacon.variable; - } - - channel = cfg80211_get_bss_channel(wiphy, variable, ielen, data->chan); - if (!channel) - return NULL; - - if (channel->band == NL80211_BAND_6GHZ && - !cfg80211_uhb_power_type_valid(variable, ielen, channel->flags)) { - data->restrict_use = 1; - data->use_for = 0; - data->cannot_use_reasons = - NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH; - } - + ie = mgmt->u.probe_resp.variable; if (ext) { const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; - elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, - variable, ielen); + if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) + ie = ext->u.s1g_short_beacon.variable; + else + ie = ext->u.s1g_beacon.variable; + + elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen); if (!elem) return NULL; if (elem->datalen < sizeof(*compat)) @@ -3169,112 +3193,26 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, compat = (void *)elem->data; bssid = ext->u.s1g_beacon.sa; capability = le16_to_cpu(compat->compat_info); - beacon_int = le16_to_cpu(compat->beacon_int); + beacon_interval = le16_to_cpu(compat->beacon_int); } else { bssid = mgmt->bssid; - beacon_int = le16_to_cpu(mgmt->u.probe_resp.beacon_int); + beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); } - if (channel->band == NL80211_BAND_60GHZ) { - bss_type = capability & WLAN_CAPABILITY_DMG_TYPE_MASK; - if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || - bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) - regulatory_hint_found_beacon(wiphy, channel, gfp); - } else { - if (capability & WLAN_CAPABILITY_ESS) - regulatory_hint_found_beacon(wiphy, channel, gfp); - } - - ies = kzalloc(sizeof(*ies) + ielen, gfp); - if (!ies) - return NULL; - ies->len = ielen; - ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); - ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control) || - ieee80211_is_s1g_beacon(mgmt->frame_control); - memcpy(ies->data, variable, ielen); + tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); if (ieee80211_is_probe_resp(mgmt->frame_control)) - rcu_assign_pointer(tmp.pub.proberesp_ies, ies); + ftype = CFG80211_BSS_FTYPE_PRESP; + else if (ext) + ftype = CFG80211_BSS_FTYPE_S1G_BEACON; else - rcu_assign_pointer(tmp.pub.beacon_ies, ies); - rcu_assign_pointer(tmp.pub.ies, ies); - - memcpy(tmp.pub.bssid, bssid, ETH_ALEN); - tmp.pub.beacon_interval = beacon_int; - tmp.pub.capability = capability; - tmp.pub.channel = channel; - tmp.pub.signal = data->signal; - tmp.ts_boottime = data->boottime_ns; - tmp.parent_tsf = data->parent_tsf; - tmp.pub.chains = data->chains; - memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS); - ether_addr_copy(tmp.parent_bssid, data->parent_bssid); - tmp.pub.use_for = data->restrict_use ? - data->use_for : - NL80211_BSS_USE_FOR_ALL; - tmp.pub.cannot_use_reasons = data->cannot_use_reasons; - - signal_valid = data->chan == channel; - spin_lock_bh(&rdev->bss_lock); - res = __cfg80211_bss_update(rdev, &tmp, signal_valid, jiffies); - if (!res) - goto drop; - - rdev_inform_bss(rdev, &res->pub, ies, data->drv_data); - - spin_unlock_bh(&rdev->bss_lock); - - trace_cfg80211_return_bss(&res->pub); - /* __cfg80211_bss_update gives us a referenced result */ - return &res->pub; - -drop: - spin_unlock_bh(&rdev->bss_lock); - return NULL; -} - -struct cfg80211_bss * -cfg80211_inform_bss_frame_data(struct wiphy *wiphy, - struct cfg80211_inform_bss *data, - struct ieee80211_mgmt *mgmt, size_t len, - gfp_t gfp) -{ - struct cfg80211_inform_single_bss_data inform_data = { - .drv_data = data, - .ie = mgmt->u.probe_resp.variable, - .ielen = len - offsetof(struct ieee80211_mgmt, - u.probe_resp.variable), - .use_for = data->restrict_use ? - data->use_for : - NL80211_BSS_USE_FOR_ALL, - .cannot_use_reasons = data->cannot_use_reasons, - }; - struct cfg80211_bss *res; - - res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt, - len, gfp); - if (!res) - return NULL; - - /* don't do any further MBSSID/ML handling for S1G */ - if (ieee80211_is_s1g_beacon(mgmt->frame_control)) - return res; - - inform_data.ftype = ieee80211_is_beacon(mgmt->frame_control) ? - CFG80211_BSS_FTYPE_BEACON : CFG80211_BSS_FTYPE_PRESP; - memcpy(inform_data.bssid, mgmt->bssid, ETH_ALEN); - inform_data.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); - inform_data.beacon_interval = - le16_to_cpu(mgmt->u.probe_resp.beacon_int); + ftype = CFG80211_BSS_FTYPE_BEACON; - /* process each non-transmitting bss */ - cfg80211_parse_mbssid_data(wiphy, &inform_data, res, gfp); - - cfg80211_parse_ml_sta_data(wiphy, &inform_data, res, gfp); - - return res; + return cfg80211_inform_bss_data(wiphy, data, ftype, + bssid, tsf, capability, + beacon_interval, ie, ielen, + gfp); } EXPORT_SYMBOL(cfg80211_inform_bss_frame_data); @@ -3483,10 +3421,14 @@ int cfg80211_wext_siwscan(struct net_device *dev, wiphy = &rdev->wiphy; /* Determine number of channels, needed to allocate creq */ - if (wreq && wreq->num_channels) + if (wreq && wreq->num_channels) { + /* Passed from userspace so should be checked */ + if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES)) + return -EINVAL; n_channels = wreq->num_channels; - else + } else { n_channels = ieee80211_get_num_supported_channels(wiphy); + } creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) + n_channels * sizeof(void *), @@ -3560,8 +3502,10 @@ int cfg80211_wext_siwscan(struct net_device *dev, memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } - if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) + if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { + creq->ssids = NULL; creq->n_ssids = 0; + } } for (i = 0; i < NUM_NL80211_BANDS; i++) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 195c853273..1cfe673bc5 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -209,7 +209,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev, if (!req.bss) { err = -ENOENT; } else { - err = cfg80211_mlme_assoc(rdev, wdev->netdev, &req); + err = cfg80211_mlme_assoc(rdev, wdev->netdev, + &req, NULL); cfg80211_put_bss(&rdev->wiphy, req.bss); } @@ -1044,6 +1045,7 @@ void cfg80211_connect_done(struct net_device *dev, cfg80211_hold_bss( bss_from_pub(params->links[link].bss)); ev->cr.links[link].bss = params->links[link].bss; + ev->cr.links[link].status = params->links[link].status; if (params->links[link].addr) { ev->cr.links[link].addr = next; @@ -1352,6 +1354,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, return; cfg80211_wdev_release_bsses(wdev); + wdev->valid_links = 0; wdev->connected = false; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 565511a3f4..62f26618f6 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -5,7 +5,7 @@ * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2020-2021, 2023 Intel Corporation + * Copyright (C) 2020-2021, 2023-2024 Intel Corporation */ #include <linux/device.h> @@ -137,7 +137,7 @@ static int wiphy_resume(struct device *dev) if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; - schedule_work(&rdev->wiphy_work); + queue_work(system_unbound_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) diff --git a/net/wireless/tests/Makefile b/net/wireless/tests/Makefile index 1f6622fcb7..c364e63b50 100644 --- a/net/wireless/tests/Makefile +++ b/net/wireless/tests/Makefile @@ -1,3 +1,3 @@ -cfg80211-tests-y += module.o fragmentation.o scan.o util.o +cfg80211-tests-y += module.o fragmentation.o scan.o util.o chan.o obj-$(CONFIG_CFG80211_KUNIT_TEST) += cfg80211-tests.o diff --git a/net/wireless/tests/chan.c b/net/wireless/tests/chan.c new file mode 100644 index 0000000000..d02258ac2d --- /dev/null +++ b/net/wireless/tests/chan.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for channel helper functions + * + * Copyright (C) 2023-2024 Intel Corporation + */ +#include <net/cfg80211.h> +#include <kunit/test.h> + +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); + +static struct ieee80211_channel chan_6ghz_1 = { + .band = NL80211_BAND_6GHZ, + .center_freq = 5955, +}; + +static struct ieee80211_channel chan_6ghz_5 = { + .band = NL80211_BAND_6GHZ, + .center_freq = 5975, +}; + +static struct ieee80211_channel chan_6ghz_105 = { + .band = NL80211_BAND_6GHZ, + .center_freq = 6475, +}; + +static const struct chandef_compat_case { + const char *desc; + /* leave c1 empty for tests for identical */ + struct cfg80211_chan_def c1, c2; + /* we test both ways around, so c2 should always be the compat one */ + bool compat; +} chandef_compat_cases[] = { + { + .desc = "identical non-HT", + .c2 = { + .width = NL80211_CHAN_WIDTH_20_NOHT, + .chan = &chan_6ghz_1, + .center_freq1 = 5955, + }, + .compat = true, + }, + { + .desc = "identical 20 MHz", + .c2 = { + .width = NL80211_CHAN_WIDTH_20, + .chan = &chan_6ghz_1, + .center_freq1 = 5955, + }, + .compat = true, + }, + { + .desc = "identical 40 MHz", + .c2 = { + .width = NL80211_CHAN_WIDTH_40, + .chan = &chan_6ghz_1, + .center_freq1 = 5955 + 10, + }, + .compat = true, + }, + { + .desc = "identical 80 MHz", + .c2 = { + .width = NL80211_CHAN_WIDTH_80, + .chan = &chan_6ghz_1, + .center_freq1 = 5955 + 10 + 20, + }, + .compat = true, + }, + { + .desc = "identical 160 MHz", + .c2 = { + .width = NL80211_CHAN_WIDTH_160, + .chan = &chan_6ghz_1, + .center_freq1 = 5955 + 10 + 20 + 40, + }, + .compat = true, + }, + { + .desc = "identical 320 MHz", + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_1, + .center_freq1 = 5955 + 10 + 20 + 40 + 80, + }, + .compat = true, + }, + { + .desc = "20 MHz in 320 MHz\n", + .c1 = { + .width = NL80211_CHAN_WIDTH_20, + .chan = &chan_6ghz_1, + .center_freq1 = 5955, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_1, + .center_freq1 = 5955 + 10 + 20 + 40 + 80, + }, + .compat = true, + }, + { + .desc = "different 20 MHz", + .c1 = { + .width = NL80211_CHAN_WIDTH_20, + .chan = &chan_6ghz_1, + .center_freq1 = 5955, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_20, + .chan = &chan_6ghz_5, + .center_freq1 = 5975, + }, + }, + { + .desc = "different primary 160 MHz", + .c1 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 + 150, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 - 10, + }, + }, + { + /* similar to previous test but one has lower BW */ + .desc = "matching primary 160 MHz", + .c1 = { + .width = NL80211_CHAN_WIDTH_160, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 + 70, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 - 10, + }, + .compat = true, + }, + { + .desc = "matching primary 160 MHz & punctured secondary 160 Mhz", + .c1 = { + .width = NL80211_CHAN_WIDTH_160, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 + 70, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 - 10, + .punctured = 0xf, + }, + .compat = true, + }, + { + .desc = "matching primary 160 MHz & punctured matching", + .c1 = { + .width = NL80211_CHAN_WIDTH_160, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 + 70, + .punctured = 0xc0, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 - 10, + .punctured = 0xc000, + }, + .compat = true, + }, + { + .desc = "matching primary 160 MHz & punctured not matching", + .c1 = { + .width = NL80211_CHAN_WIDTH_160, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 + 70, + .punctured = 0x80, + }, + .c2 = { + .width = NL80211_CHAN_WIDTH_320, + .chan = &chan_6ghz_105, + .center_freq1 = 6475 - 10, + .punctured = 0xc000, + }, + }, +}; + +KUNIT_ARRAY_PARAM_DESC(chandef_compat, chandef_compat_cases, desc) + +static void test_chandef_compat(struct kunit *test) +{ + const struct chandef_compat_case *params = test->param_value; + const struct cfg80211_chan_def *ret, *expect; + struct cfg80211_chan_def c1 = params->c1; + + /* tests with identical ones */ + if (!params->c1.chan) + c1 = params->c2; + + KUNIT_EXPECT_EQ(test, cfg80211_chandef_valid(&c1), true); + KUNIT_EXPECT_EQ(test, cfg80211_chandef_valid(¶ms->c2), true); + + expect = params->compat ? ¶ms->c2 : NULL; + + ret = cfg80211_chandef_compatible(&c1, ¶ms->c2); + KUNIT_EXPECT_PTR_EQ(test, ret, expect); + + if (!params->c1.chan) + expect = &c1; + + ret = cfg80211_chandef_compatible(¶ms->c2, &c1); + KUNIT_EXPECT_PTR_EQ(test, ret, expect); +} + +static struct kunit_case chandef_compat_test_cases[] = { + KUNIT_CASE_PARAM(test_chandef_compat, chandef_compat_gen_params), + {} +}; + +static struct kunit_suite chandef_compat = { + .name = "cfg80211-chandef-compat", + .test_cases = chandef_compat_test_cases, +}; + +kunit_test_suite(chandef_compat); diff --git a/net/wireless/tests/fragmentation.c b/net/wireless/tests/fragmentation.c index 49a339ca88..411fae18cd 100644 --- a/net/wireless/tests/fragmentation.c +++ b/net/wireless/tests/fragmentation.c @@ -2,7 +2,7 @@ /* * KUnit tests for element fragmentation * - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation */ #include <linux/ieee80211.h> #include <net/cfg80211.h> @@ -27,7 +27,12 @@ static void defragment_0(struct kunit *test) ret = cfg80211_defragment_element((void *)input, input, sizeof(input), - data, sizeof(input), + NULL, 0, + WLAN_EID_FRAGMENT); + KUNIT_EXPECT_EQ(test, ret, 253); + ret = cfg80211_defragment_element((void *)input, + input, sizeof(input), + data, ret, WLAN_EID_FRAGMENT); KUNIT_EXPECT_EQ(test, ret, 253); KUNIT_EXPECT_MEMEQ(test, data, input + 3, 253); @@ -63,7 +68,12 @@ static void defragment_1(struct kunit *test) ret = cfg80211_defragment_element((void *)input, input, sizeof(input), - data, sizeof(input), + NULL, 0, + WLAN_EID_FRAGMENT); + KUNIT_EXPECT_EQ(test, ret, 254 + 7); + ret = cfg80211_defragment_element((void *)input, + input, sizeof(input), + data, ret, WLAN_EID_FRAGMENT); /* this means the last fragment was not used */ KUNIT_EXPECT_EQ(test, ret, 254 + 7); @@ -106,10 +116,15 @@ static void defragment_2(struct kunit *test) ret = cfg80211_defragment_element((void *)input, input, sizeof(input), - data, sizeof(input), + NULL, 0, WLAN_EID_FRAGMENT); /* this means the last fragment was not used */ KUNIT_EXPECT_EQ(test, ret, 254 + 255 + 1); + ret = cfg80211_defragment_element((void *)input, + input, sizeof(input), + data, ret, + WLAN_EID_FRAGMENT); + KUNIT_EXPECT_EQ(test, ret, 254 + 255 + 1); KUNIT_EXPECT_MEMEQ(test, data, input + 3, 254); KUNIT_EXPECT_MEMEQ(test, data + 254, input + 257 + 2, 255); KUNIT_EXPECT_MEMEQ(test, data + 254 + 255, input + 2 * 257 + 2, 1); @@ -134,7 +149,12 @@ static void defragment_at_end(struct kunit *test) ret = cfg80211_defragment_element((void *)input, input, sizeof(input), - data, sizeof(input), + NULL, 0, + WLAN_EID_FRAGMENT); + KUNIT_EXPECT_EQ(test, ret, 254 + 7); + ret = cfg80211_defragment_element((void *)input, + input, sizeof(input), + data, ret, WLAN_EID_FRAGMENT); KUNIT_EXPECT_EQ(test, ret, 254 + 7); KUNIT_EXPECT_MEMEQ(test, data, input + 3, 254); diff --git a/net/wireless/tests/scan.c b/net/wireless/tests/scan.c index f9ea44aee9..9f458be716 100644 --- a/net/wireless/tests/scan.c +++ b/net/wireless/tests/scan.c @@ -407,6 +407,7 @@ static struct inform_bss_ml_sta_case { int mld_id; bool sta_prof_vendor_elems; bool include_oper_class; + bool nstr; } inform_bss_ml_sta_cases[] = { { .desc = "zero_mld_id", @@ -426,6 +427,10 @@ static struct inform_bss_ml_sta_case { .mld_id = 1, .sta_prof_vendor_elems = true, .include_oper_class = true, + }, { + .desc = "nstr", + .mld_id = 0, + .nstr = true, }, }; KUNIT_ARRAY_PARAM_DESC(inform_bss_ml_sta, inform_bss_ml_sta_cases, desc) @@ -458,7 +463,7 @@ static void test_inform_bss_ml_sta(struct kunit *test) struct { struct ieee80211_neighbor_ap_info info; struct ieee80211_tbtt_info_ge_11 ap; - } __packed rnr = { + } __packed rnr_normal = { .info = { .tbtt_info_hdr = u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT), .tbtt_info_len = sizeof(struct ieee80211_tbtt_info_ge_11), @@ -478,6 +483,28 @@ static void test_inform_bss_ml_sta(struct kunit *test) } }; struct { + struct ieee80211_neighbor_ap_info info; + struct ieee80211_rnr_mld_params mld_params; + } __packed rnr_nstr = { + .info = { + .tbtt_info_hdr = + u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT) | + u8_encode_bits(IEEE80211_TBTT_INFO_TYPE_MLD, + IEEE80211_AP_INFO_TBTT_HDR_TYPE), + .tbtt_info_len = sizeof(struct ieee80211_rnr_mld_params), + .op_class = 81, + .channel = 11, + }, + .mld_params = { + .mld_id = params->mld_id, + .params = + le16_encode_bits(link_id, + IEEE80211_RNR_MLD_PARAMS_LINK_ID), + } + }; + size_t rnr_len = params->nstr ? sizeof(rnr_nstr) : sizeof(rnr_normal); + void *rnr = params->nstr ? (void *)&rnr_nstr : (void *)&rnr_normal; + struct { __le16 control; u8 var_len; u8 mld_mac_addr[ETH_ALEN]; @@ -516,7 +543,7 @@ static void test_inform_bss_ml_sta(struct kunit *test) u16_encode_bits(link_id, IEEE80211_MLE_STA_CONTROL_LINK_ID)), .var_len = sizeof(sta_prof) - 2 - 2, - .bssid = { *rnr.ap.bssid }, + .bssid = { *rnr_normal.ap.bssid }, .beacon_int = cpu_to_le16(101), .tsf_offset = cpu_to_le64(-123ll), .capabilities = cpu_to_le16(0xdead), @@ -540,8 +567,8 @@ static void test_inform_bss_ml_sta(struct kunit *test) } skb_put_u8(input, WLAN_EID_REDUCED_NEIGHBOR_REPORT); - skb_put_u8(input, sizeof(rnr)); - skb_put_data(input, &rnr, sizeof(rnr)); + skb_put_u8(input, rnr_len); + skb_put_data(input, rnr, rnr_len); /* build a multi-link element */ skb_put_u8(input, WLAN_EID_EXTENSION); @@ -587,9 +614,10 @@ static void test_inform_bss_ml_sta(struct kunit *test) KUNIT_EXPECT_EQ(test, ctx.inform_bss_count, 2); /* Check link_bss *****************************************************/ - link_bss = cfg80211_get_bss(wiphy, NULL, sta_prof.bssid, NULL, 0, - IEEE80211_BSS_TYPE_ANY, - IEEE80211_PRIVACY_ANY); + link_bss = __cfg80211_get_bss(wiphy, NULL, sta_prof.bssid, NULL, 0, + IEEE80211_BSS_TYPE_ANY, + IEEE80211_PRIVACY_ANY, + 0); KUNIT_ASSERT_NOT_NULL(test, link_bss); KUNIT_EXPECT_EQ(test, link_bss->signal, 0); KUNIT_EXPECT_EQ(test, link_bss->beacon_interval, @@ -600,6 +628,22 @@ static void test_inform_bss_ml_sta(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, link_bss->channel, ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(2462))); + /* Test wiphy does not set WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY */ + if (params->nstr) { + KUNIT_EXPECT_EQ(test, link_bss->use_for, 0); + KUNIT_EXPECT_EQ(test, link_bss->cannot_use_reasons, + NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY); + KUNIT_EXPECT_NULL(test, + cfg80211_get_bss(wiphy, NULL, sta_prof.bssid, + NULL, 0, + IEEE80211_BSS_TYPE_ANY, + IEEE80211_PRIVACY_ANY)); + } else { + KUNIT_EXPECT_EQ(test, link_bss->use_for, + NL80211_BSS_USE_FOR_ALL); + KUNIT_EXPECT_EQ(test, link_bss->cannot_use_reasons, 0); + } + rcu_read_lock(); ies = rcu_dereference(link_bss->ies); KUNIT_EXPECT_NOT_NULL(test, ies); @@ -607,20 +651,20 @@ static void test_inform_bss_ml_sta(struct kunit *test) /* Resulting length should be: * SSID (inherited) + RNR (inherited) + vendor element(s) + * operating class (if requested) + - * generated RNR (if MLD ID == 0) + + * generated RNR (if MLD ID == 0 and not NSTR) + * MLE common info + MLE header and control */ if (params->sta_prof_vendor_elems) KUNIT_EXPECT_EQ(test, ies->len, - 6 + 2 + sizeof(rnr) + 2 + 160 + 2 + 165 + + 6 + 2 + rnr_len + 2 + 160 + 2 + 165 + (params->include_oper_class ? 3 : 0) + - (!params->mld_id ? 22 : 0) + + (!params->mld_id && !params->nstr ? 22 : 0) + mle_basic_common_info.var_len + 5); else KUNIT_EXPECT_EQ(test, ies->len, - 6 + 2 + sizeof(rnr) + 2 + 155 + + 6 + 2 + rnr_len + 2 + 155 + (params->include_oper_class ? 3 : 0) + - (!params->mld_id ? 22 : 0) + + (!params->mld_id && !params->nstr ? 22 : 0) + mle_basic_common_info.var_len + 5); rcu_read_unlock(); @@ -628,6 +672,172 @@ static void test_inform_bss_ml_sta(struct kunit *test) cfg80211_put_bss(wiphy, link_bss); } +static struct cfg80211_parse_colocated_ap_case { + const char *desc; + u8 op_class; + u8 channel; + struct ieee80211_neighbor_ap_info info; + union { + struct ieee80211_tbtt_info_ge_11 tbtt_long; + struct ieee80211_tbtt_info_7_8_9 tbtt_short; + }; + bool add_junk; + bool same_ssid; + bool valid; +} cfg80211_parse_colocated_ap_cases[] = { + { + .desc = "wrong_band", + .info.op_class = 81, + .info.channel = 11, + .tbtt_long = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP, + }, + .valid = false, + }, + { + .desc = "wrong_type", + /* IEEE80211_AP_INFO_TBTT_HDR_TYPE is in the least significant bits */ + .info.tbtt_info_hdr = IEEE80211_TBTT_INFO_TYPE_MLD, + .tbtt_long = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP, + }, + .valid = false, + }, + { + .desc = "colocated_invalid_len_short", + .info.tbtt_info_len = 6, + .tbtt_short = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP | + IEEE80211_RNR_TBTT_PARAMS_SAME_SSID, + }, + .valid = false, + }, + { + .desc = "colocated_invalid_len_short_mld", + .info.tbtt_info_len = 10, + .tbtt_long = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP, + }, + .valid = false, + }, + { + .desc = "colocated_non_mld", + .info.tbtt_info_len = sizeof(struct ieee80211_tbtt_info_7_8_9), + .tbtt_short = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP | + IEEE80211_RNR_TBTT_PARAMS_SAME_SSID, + }, + .same_ssid = true, + .valid = true, + }, + { + .desc = "colocated_non_mld_invalid_bssid", + .info.tbtt_info_len = sizeof(struct ieee80211_tbtt_info_7_8_9), + .tbtt_short = { + .bssid = { 0xff, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP | + IEEE80211_RNR_TBTT_PARAMS_SAME_SSID, + }, + .same_ssid = true, + .valid = false, + }, + { + .desc = "colocated_mld", + .tbtt_long = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP, + }, + .valid = true, + }, + { + .desc = "colocated_mld", + .tbtt_long = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP, + }, + .add_junk = true, + .valid = false, + }, + { + .desc = "colocated_disabled_mld", + .tbtt_long = { + .bssid = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, + .bss_params = IEEE80211_RNR_TBTT_PARAMS_COLOC_AP, + .mld_params.params = cpu_to_le16(IEEE80211_RNR_MLD_PARAMS_DISABLED_LINK), + }, + .valid = false, + }, +}; +KUNIT_ARRAY_PARAM_DESC(cfg80211_parse_colocated_ap, cfg80211_parse_colocated_ap_cases, desc) + +static void test_cfg80211_parse_colocated_ap(struct kunit *test) +{ + const struct cfg80211_parse_colocated_ap_case *params = test->param_value; + struct sk_buff *input = kunit_zalloc_skb(test, 1024, GFP_KERNEL); + struct cfg80211_bss_ies *ies; + struct ieee80211_neighbor_ap_info info; + LIST_HEAD(coloc_ap_list); + int count; + + KUNIT_ASSERT_NOT_NULL(test, input); + + info = params->info; + + /* Reasonable values for a colocated AP */ + if (!info.tbtt_info_len) + info.tbtt_info_len = sizeof(params->tbtt_long); + if (!info.op_class) + info.op_class = 131; + if (!info.channel) + info.channel = 33; + /* Zero is the correct default for .btt_info_hdr (one entry, TBTT type) */ + + skb_put_u8(input, WLAN_EID_SSID); + skb_put_u8(input, 4); + skb_put_data(input, "TEST", 4); + + skb_put_u8(input, WLAN_EID_REDUCED_NEIGHBOR_REPORT); + skb_put_u8(input, sizeof(info) + info.tbtt_info_len + (params->add_junk ? 3 : 0)); + skb_put_data(input, &info, sizeof(info)); + skb_put_data(input, ¶ms->tbtt_long, info.tbtt_info_len); + + if (params->add_junk) + skb_put_data(input, "123", 3); + + ies = kunit_kzalloc(test, struct_size(ies, data, input->len), GFP_KERNEL); + ies->len = input->len; + memcpy(ies->data, input->data, input->len); + + count = cfg80211_parse_colocated_ap(ies, &coloc_ap_list); + + KUNIT_EXPECT_EQ(test, count, params->valid); + KUNIT_EXPECT_EQ(test, list_count_nodes(&coloc_ap_list), params->valid); + + if (params->valid && !list_empty(&coloc_ap_list)) { + struct cfg80211_colocated_ap *ap; + + ap = list_first_entry(&coloc_ap_list, typeof(*ap), list); + if (info.tbtt_info_len <= sizeof(params->tbtt_short)) + KUNIT_EXPECT_MEMEQ(test, ap->bssid, params->tbtt_short.bssid, ETH_ALEN); + else + KUNIT_EXPECT_MEMEQ(test, ap->bssid, params->tbtt_long.bssid, ETH_ALEN); + + if (params->same_ssid) { + KUNIT_EXPECT_EQ(test, ap->ssid_len, 4); + KUNIT_EXPECT_MEMEQ(test, ap->ssid, "TEST", 4); + } else { + KUNIT_EXPECT_EQ(test, ap->ssid_len, 0); + } + } + + cfg80211_free_coloc_ap_list(&coloc_ap_list); +} + static struct kunit_case gen_new_ie_test_cases[] = { KUNIT_CASE_PARAM(test_gen_new_ie, gen_new_ie_gen_params), KUNIT_CASE(test_gen_new_ie_malformed), @@ -653,3 +863,16 @@ static struct kunit_suite inform_bss = { }; kunit_test_suite(inform_bss); + +static struct kunit_case scan_6ghz_cases[] = { + KUNIT_CASE_PARAM(test_cfg80211_parse_colocated_ap, + cfg80211_parse_colocated_ap_gen_params), + {} +}; + +static struct kunit_suite scan_6ghz = { + .name = "cfg80211-scan-6ghz", + .test_cases = scan_6ghz_cases, +}; + +kunit_test_suite(scan_6ghz); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 1f374c8a17..87986170d1 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions of this file + * Copyright(c) 2016-2017 Intel Deutschland GmbH + * Copyright (C) 2018, 2020-2024 Intel Corporation + */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg80211 @@ -135,7 +140,8 @@ __field(u32, width) \ __field(u32, center_freq1) \ __field(u32, freq1_offset) \ - __field(u32, center_freq2) + __field(u32, center_freq2) \ + __field(u16, punctured) #define CHAN_DEF_ASSIGN(chandef) \ do { \ if ((chandef) && (chandef)->chan) { \ @@ -148,6 +154,7 @@ __entry->center_freq1 = (chandef)->center_freq1;\ __entry->freq1_offset = (chandef)->freq1_offset;\ __entry->center_freq2 = (chandef)->center_freq2;\ + __entry->punctured = (chandef)->punctured; \ } else { \ __entry->band = 0; \ __entry->control_freq = 0; \ @@ -156,14 +163,15 @@ __entry->center_freq1 = 0; \ __entry->freq1_offset = 0; \ __entry->center_freq2 = 0; \ + __entry->punctured = 0; \ } \ } while (0) #define CHAN_DEF_PR_FMT \ - "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u" + "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u, punct: 0x%x" #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \ __entry->freq_offset, __entry->width, \ __entry->center_freq1, __entry->freq1_offset, \ - __entry->center_freq2 + __entry->center_freq2, __entry->punctured #define FILS_AAD_ASSIGN(fa) \ do { \ @@ -364,7 +372,7 @@ TRACE_EVENT(rdev_add_virtual_intf, ), TP_fast_assign( WIPHY_ASSIGN; - __assign_str(vir_intf_name, name ? name : "<noname>"); + __assign_str(vir_intf_name); __entry->type = type; ), TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d", @@ -810,8 +818,8 @@ DECLARE_EVENT_CLASS(station_add_change, params->link_sta_params.opmode_notif_used; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" - ", station flags mask: %u, station flags set: %u, " - "station modify mask: %u, listen interval: %d, aid: %u, " + ", station flags mask: 0x%x, station flags set: 0x%x, " + "station modify mask: 0x%x, listen interval: %d, aid: %u, " "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, __entry->sta_flags_mask, __entry->sta_flags_set, @@ -859,6 +867,7 @@ DECLARE_EVENT_CLASS(station_del, MAC_ENTRY(sta_mac) __field(u8, subtype) __field(u16, reason_code) + __field(int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ -866,11 +875,13 @@ DECLARE_EVENT_CLASS(station_del, MAC_ASSIGN(sta_mac, params->mac); __entry->subtype = params->subtype; __entry->reason_code = params->reason_code; + __entry->link_id = params->link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" - ", subtype: %u, reason_code: %u", + ", subtype: %u, reason_code: %u, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, - __entry->subtype, __entry->reason_code) + __entry->subtype, __entry->reason_code, + __entry->link_id) ); DEFINE_EVENT(station_del, rdev_del_station, @@ -1013,7 +1024,7 @@ TRACE_EVENT(rdev_get_mpp, TRACE_EVENT(rdev_dump_mpp, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *dst, u8 *mpp), - TP_ARGS(wiphy, netdev, _idx, mpp, dst), + TP_ARGS(wiphy, netdev, _idx, dst, mpp), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -1064,7 +1075,7 @@ TRACE_EVENT(rdev_return_int_mpath_info, ), TP_printk(WIPHY_PR_FMT ", returned %d. mpath info - generation: %d, " "filled: %u, frame qlen: %u, sn: %u, metric: %u, exptime: %u," - " discovery timeout: %u, discovery retries: %u, flags: %u", + " discovery timeout: %u, discovery retries: %u, flags: 0x%x", WIPHY_PR_ARG, __entry->ret, __entry->generation, __entry->filled, __entry->frame_qlen, __entry->sn, __entry->metric, __entry->exptime, __entry->discovery_timeout, @@ -1306,7 +1317,7 @@ TRACE_EVENT(rdev_assoc, req->fils_nonces, 2 * FILS_NONCE_LEN); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" - ", previous bssid: %pM, use mfp: %s, flags: %u", + ", previous bssid: %pM, use mfp: %s, flags: 0x%x", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->prev_bssid, BOOL_TO_STR(__entry->use_mfp), __entry->flags) @@ -1428,7 +1439,7 @@ TRACE_EVENT(rdev_connect, ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, " - "flags: %u, previous bssid: %pM", + "flags: 0x%x, previous bssid: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid, __entry->auth_type, BOOL_TO_STR(__entry->privacy), __entry->wpa_versions, __entry->flags, __entry->prev_bssid) @@ -1747,7 +1758,7 @@ TRACE_EVENT(rdev_return_void_tx_rx, DECLARE_EVENT_CLASS(tx_rx_evt, TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, rx, tx), + TP_ARGS(wiphy, tx, rx), TP_STRUCT__entry( WIPHY_ENTRY __field(u32, tx) @@ -1764,7 +1775,7 @@ DECLARE_EVENT_CLASS(tx_rx_evt, DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, rx, tx) + TP_ARGS(wiphy, tx, rx) ); DECLARE_EVENT_CLASS(wiphy_netdev_id_evt, @@ -2324,6 +2335,7 @@ TRACE_EVENT(rdev_channel_switch, __field(u8, count) __dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon) __dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp) + __field(u8, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2341,11 +2353,13 @@ TRACE_EVENT(rdev_channel_switch, memcpy(__get_dynamic_array(pres_ofs), params->counter_offsets_presp, params->n_counter_offsets_presp * sizeof(u16)); + __entry->link_id = params->link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT - ", block_tx: %d, count: %u, radar_required: %d", + ", block_tx: %d, count: %u, radar_required: %d, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, - __entry->block_tx, __entry->count, __entry->radar_required) + __entry->block_tx, __entry->count, __entry->radar_required, + __entry->link_id) ); TRACE_EVENT(rdev_set_qos_map, @@ -2828,6 +2842,7 @@ TRACE_EVENT(rdev_color_change, __field(u8, count) __field(u16, bcn_ofs) __field(u16, pres_ofs) + __field(u8, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2835,11 +2850,12 @@ TRACE_EVENT(rdev_color_change, __entry->count = params->count; __entry->bcn_ofs = params->counter_offset_beacon; __entry->pres_ofs = params->counter_offset_presp; + __entry->link_id = params->link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT - ", count: %u", + ", count: %u, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->count) + __entry->count, __entry->link_id) ); TRACE_EVENT(rdev_set_radar_background, @@ -3267,47 +3283,39 @@ TRACE_EVENT(cfg80211_chandef_dfs_required, TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, - unsigned int link_id, - u16 punct_bitmap), - TP_ARGS(netdev, chandef, link_id, punct_bitmap), + unsigned int link_id), + TP_ARGS(netdev, chandef, link_id), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY __field(unsigned int, link_id) - __field(u16, punct_bitmap) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->link_id = link_id; - __entry->punct_bitmap = punct_bitmap; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d, punct_bitmap:%u", - NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id, - __entry->punct_bitmap) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_ch_switch_started_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, - unsigned int link_id, - u16 punct_bitmap), - TP_ARGS(netdev, chandef, link_id, punct_bitmap), + unsigned int link_id), + TP_ARGS(netdev, chandef, link_id), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY __field(unsigned int, link_id) - __field(u16, punct_bitmap) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->link_id = link_id; - __entry->punct_bitmap = punct_bitmap; ), - TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d, punct_bitmap:%u", - NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id, - __entry->punct_bitmap) + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", + NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_radar_event, diff --git a/net/wireless/util.c b/net/wireless/util.c index b9d15f3693..af6ec71956 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1504,7 +1504,7 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; - u32 rates_969[3] = { 480388888, 453700000, 408333333 }; + u32 rates_996[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; @@ -1524,12 +1524,14 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; - if (rate->bw == RATE_INFO_BW_160) + if (rate->bw == RATE_INFO_BW_160 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) - result = rates_969[rate->he_gi]; + result = rates_996[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) @@ -2079,6 +2081,82 @@ bool ieee80211_operating_class_to_band(u8 operating_class, } EXPORT_SYMBOL(ieee80211_operating_class_to_band); +bool ieee80211_operating_class_to_chandef(u8 operating_class, + struct ieee80211_channel *chan, + struct cfg80211_chan_def *chandef) +{ + u32 control_freq, offset = 0; + enum nl80211_band band; + + if (!ieee80211_operating_class_to_band(operating_class, &band) || + !chan || band != chan->band) + return false; + + control_freq = chan->center_freq; + chandef->chan = chan; + + if (control_freq >= 5955) + offset = control_freq - 5955; + else if (control_freq >= 5745) + offset = control_freq - 5745; + else if (control_freq >= 5180) + offset = control_freq - 5180; + offset /= 20; + + switch (operating_class) { + case 81: /* 2 GHz band; 20 MHz; channels 1..13 */ + case 82: /* 2 GHz band; 20 MHz; channel 14 */ + case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */ + case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */ + case 121: /* 5 GHz band; 20 MHz; channels 100..144 */ + case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */ + case 125: /* 5 GHz band; 20 MHz; channels 149..177 */ + case 131: /* 6 GHz band; 20 MHz; channels 1..233*/ + case 136: /* 6 GHz band; 20 MHz; channel 2 */ + chandef->center_freq1 = control_freq; + chandef->width = NL80211_CHAN_WIDTH_20; + return true; + case 83: /* 2 GHz band; 40 MHz; channels 1..9 */ + case 116: /* 5 GHz band; 40 MHz; channels 36,44 */ + case 119: /* 5 GHz band; 40 MHz; channels 52,60 */ + case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */ + case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */ + chandef->center_freq1 = control_freq + 10; + chandef->width = NL80211_CHAN_WIDTH_40; + return true; + case 84: /* 2 GHz band; 40 MHz; channels 5..13 */ + case 117: /* 5 GHz band; 40 MHz; channels 40,48 */ + case 120: /* 5 GHz band; 40 MHz; channels 56,64 */ + case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */ + case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */ + chandef->center_freq1 = control_freq - 10; + chandef->width = NL80211_CHAN_WIDTH_40; + return true; + case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/ + chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20; + chandef->width = NL80211_CHAN_WIDTH_40; + return true; + case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */ + case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */ + chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20; + chandef->width = NL80211_CHAN_WIDTH_80; + return true; + case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */ + case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */ + chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20; + chandef->width = NL80211_CHAN_WIDTH_160; + return true; + case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */ + case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */ + /* The center_freq2 of 80+80 MHz is unknown */ + case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */ + /* 320-1 or 320-2 channelization is unknown */ + default: + return false; + } +} +EXPORT_SYMBOL(ieee80211_operating_class_to_chandef); + bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { @@ -2473,6 +2551,7 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; + int ret; wdev = dev->ieee80211_ptr; if (!wdev) @@ -2484,7 +2563,11 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, memset(sinfo, 0, sizeof(*sinfo)); - return rdev_get_station(rdev, dev, mac_addr, sinfo); + wiphy_lock(&rdev->wiphy); + ret = rdev_get_station(rdev, dev, mac_addr, sinfo); + wiphy_unlock(&rdev->wiphy); + + return ret; } EXPORT_SYMBOL(cfg80211_get_station); diff --git a/net/x25/Kconfig b/net/x25/Kconfig index 68729aa3a5..dc72302cbd 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -17,8 +17,6 @@ config X25 if you want that) and the lower level data link layer protocol LAPB (say Y to "LAPB Data Link Driver" below if you want that). - You can read more about X.25 at <https://www.sangoma.com/tutorials/x25/> and - <http://docwiki.cisco.com/wiki/X.25>. Information about X.25 for Linux is contained in the files <file:Documentation/networking/x25.rst> and <file:Documentation/networking/x25-iface.rst>. diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index d18d51412c..8dda417849 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -871,8 +871,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout) return rc; } -static int x25_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern) +static int x25_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c index e9802afa43..643f50874d 100644 --- a/net/x25/sysctl_net_x25.c +++ b/net/x25/sysctl_net_x25.c @@ -71,7 +71,6 @@ static struct ctl_table x25_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; int __init x25_register_sysctl(void) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index caa340134b..9f76ca591d 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -151,6 +151,7 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) #define XDP_UMEM_FLAGS_VALID ( \ XDP_UMEM_UNALIGNED_CHUNK_FLAG | \ XDP_UMEM_TX_SW_CSUM | \ + XDP_UMEM_TX_METADATA_LEN | \ 0) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) @@ -204,8 +205,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) - return -EINVAL; + if (mr->flags & XDP_UMEM_TX_METADATA_LEN) { + if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) + return -EINVAL; + umem->tx_metadata_len = mr->tx_metadata_len; + } umem->size = size; umem->headroom = headroom; @@ -215,7 +219,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; - umem->tx_metadata_len = mr->tx_metadata_len; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index ce60ecd48a..c0e0204b96 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -338,7 +338,6 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi dma_map->netdev = netdev; dma_map->dev = dev; - dma_map->dma_need_sync = false; dma_map->dma_pages_cnt = nr_pages; refcount_set(&dma_map->users, 1); list_add(&dma_map->list, &umem->xsk_dma_list); @@ -424,7 +423,6 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ pool->dev = dma_map->dev; pool->dma_pages_cnt = dma_map->dma_pages_cnt; - pool->dma_need_sync = dma_map->dma_need_sync; memcpy(pool->dma_pages, dma_map->dma_pages, pool->dma_pages_cnt * sizeof(*pool->dma_pages)); @@ -460,8 +458,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, __xp_dma_unmap(dma_map, attrs); return -ENOMEM; } - if (dma_need_sync(dev, dma)) - dma_map->dma_need_sync = true; dma_map->dma_pages[i] = dma; } @@ -557,11 +553,9 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) xskb->xdp.data_meta = xskb->xdp.data; xskb->xdp.flags = 0; - if (pool->dma_need_sync) { - dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, - pool->frame_len, - DMA_BIDIRECTIONAL); - } + if (pool->dev) + xp_dma_sync_for_device(pool, xskb->dma, pool->frame_len); + return &xskb->xdp; } EXPORT_SYMBOL(xp_alloc); @@ -633,7 +627,7 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { u32 nb_entries1 = 0, nb_entries2; - if (unlikely(pool->dma_need_sync)) { + if (unlikely(pool->dev && dma_dev_need_sync(pool->dev))) { struct xdp_buff *buff; /* Slow path */ @@ -693,18 +687,3 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) (addr & ~PAGE_MASK); } EXPORT_SYMBOL(xp_raw_get_dma); - -void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) -{ - dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, - xskb->pool->frame_len, DMA_BIDIRECTIONAL); -} -EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); - -void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size) -{ - dma_sync_single_range_for_device(pool->dev, dma, 0, - size, DMA_BIDIRECTIONAL); -} -EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 9f89553672..09dcea0cbb 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -194,6 +194,7 @@ static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) } static const struct sock_diag_handler xsk_diag_handler = { + .owner = THIS_MODULE, .family = AF_XDP, .dump = xsk_diag_handler_dump, }; diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index d3b3f9e720..fe82e2d073 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif +#include <net/hotdata.h> static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -169,7 +170,8 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&ctx->out_queue) >= + READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 655fe4ff86..703d4172c7 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -98,6 +98,7 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = { }; static const struct nla_policy compat_policy[XFRMA_MAX+1] = { + [XFRMA_UNSPEC] = { .strict_start_type = XFRMA_SA_DIR }, [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, [XFRMA_LASTUSED] = { .type = NLA_U64}, @@ -129,6 +130,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = { [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, [XFRMA_IF_ID] = { .type = NLA_U32 }, [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), }; static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, @@ -277,9 +279,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_SET_MARK_MASK: case XFRMA_IF_ID: case XFRMA_MTIMER_THRESH: + case XFRMA_SA_DIR: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -434,7 +437,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6346690d5c..2455a76a1c 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -253,6 +253,12 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } + if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) || + (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) { + NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction"); + return -EINVAL; + } + is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support UDP encapsulation and TFC padding. */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bd4ce21d76..e95462b982 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -21,6 +21,7 @@ #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/dst_metadata.h> +#include <net/hotdata.h> #include "xfrm_inout.h" @@ -388,11 +389,15 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); + if (xo) + xo->orig_mac_len = + skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); @@ -403,11 +408,15 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) + struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); + if (xo) + xo->orig_mac_len = + skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - @@ -570,6 +579,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + secpath_reset(skb); + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + xfrm_audit_state_notfound(skb, family, spi, seq); + xfrm_state_put(x); + x = NULL; + goto drop; + } + skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; @@ -764,7 +782,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, trans = this_cpu_ptr(&xfrm_trans_tasklet); - if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) + if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c index 7d5e920141..5ea15037eb 100644 --- a/net/xfrm/xfrm_interface_bpf.c +++ b/net/xfrm/xfrm_interface_bpf.c @@ -93,10 +93,10 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp __bpf_kfunc_end_defs(); -BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_KFUNCS_START(xfrm_ifc_kfunc_set) BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) -BTF_SET8_END(xfrm_ifc_kfunc_set) +BTF_KFUNCS_END(xfrm_ifc_kfunc_set) static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 21d50d75c2..e50e4bf993 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -240,7 +240,6 @@ static void xfrmi_dev_free(struct net_device *dev) struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); - free_percpu(dev->tstats); } static int xfrmi_create(struct net_device *dev) @@ -727,7 +726,7 @@ static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return xi->p.link; + return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { @@ -749,6 +748,7 @@ static void xfrmi_dev_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); @@ -765,15 +765,9 @@ static int xfrmi_dev_init(struct net_device *dev) struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - err = gro_cells_init(&xi->gro_cells, dev); - if (err) { - free_percpu(dev->tstats); + if (err) return err; - } dev->features |= NETIF_F_LLTX; dev->features |= XFRMI_FEATURES; @@ -932,7 +926,7 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return xi->net; + return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { @@ -957,12 +951,12 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) +static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, + struct list_head *dev_to_kill) { struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; @@ -973,18 +967,16 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) - unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) - unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_queue(xi->dev, dev_to_kill); } - unregister_netdevice_many(&list); - rtnl_unlock(); } static struct pernet_operations xfrmi_net_ops = { - .exit_batch = xfrmi_exit_batch_net, + .exit_batch_rtnl = xfrmi_exit_batch_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index da6ecc6b3e..56b88ad88d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -29,6 +29,7 @@ #include <linux/audit.h> #include <linux/rhashtable.h> #include <linux/if_tunnel.h> +#include <linux/icmp.h> #include <net/dst.h> #include <net/flow.h> #include <net/inet_ecn.h> @@ -451,6 +452,8 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + xfrm_dev_policy_delete(policy); + write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); @@ -1849,7 +1852,6 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -1890,7 +1892,6 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -2341,7 +2342,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { - xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } @@ -2488,6 +2488,12 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); + if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR); + xfrm_state_put(x); + error = -EINVAL; + goto fail; + } if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; @@ -2597,8 +2603,7 @@ static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { - struct rt6_info *rt = (struct rt6_info *)dst; - path->path_cookie = rt6_get_cookie(rt); + path->path_cookie = rt6_get_cookie(dst_rt6_info(dst)); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } @@ -3505,6 +3510,130 @@ static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int return 0; } +static bool icmp_err_packet(const struct flowi *fl, unsigned short family) +{ + const struct flowi4 *fl4 = &fl->u.ip4; + + if (family == AF_INET && + fl4->flowi4_proto == IPPROTO_ICMP && + (fl4->fl4_icmp_type == ICMP_DEST_UNREACH || + fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED)) + return true; + +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6) { + const struct flowi6 *fl6 = &fl->u.ip6; + + if (fl6->flowi6_proto == IPPROTO_ICMPV6 && + (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH || + fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG || + fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED)) + return true; + } +#endif + return false; +} + +static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, + const struct flowi *fl, struct flowi *fl1) +{ + bool ret = true; + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + int hl = family == AF_INET ? (sizeof(struct iphdr) + sizeof(struct icmphdr)) : + (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)); + + if (!newskb) + return true; + + if (!pskb_pull(newskb, hl)) + goto out; + + skb_reset_network_header(newskb); + + if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0) + goto out; + + fl1->flowi_oif = fl->flowi_oif; + fl1->flowi_mark = fl->flowi_mark; + fl1->flowi_tos = fl->flowi_tos; + nf_nat_decode_session(newskb, fl1, family); + ret = false; + +out: + consume_skb(newskb); + return ret; +} + +static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family, + const struct xfrm_selector *sel, + const struct flowi *fl) +{ + bool ret = false; + + if (icmp_err_packet(fl, family)) { + struct flowi fl1; + + if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) + return ret; + + ret = xfrm_selector_match(sel, &fl1, family); + } + + return ret; +} + +static inline struct +xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb, + const struct flowi *fl, unsigned short family, + u32 if_id) +{ + struct xfrm_policy *pol = NULL; + + if (icmp_err_packet(fl, family)) { + struct flowi fl1; + struct net *net = dev_net(skb->dev); + + if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) + return pol; + + pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id); + if (IS_ERR(pol)) + pol = NULL; + } + + return pol; +} + +static inline struct +dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl, + unsigned short family, struct dst_entry *dst) +{ + if (icmp_err_packet(fl, family)) { + struct net *net = dev_net(skb->dev); + struct dst_entry *dst2; + struct flowi fl1; + + if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) + return dst; + + dst_hold(dst); + + dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP)); + + if (IS_ERR(dst2)) + return dst; + + if (dst2->xfrm) { + dst_release(dst); + dst = dst2; + } else { + dst_release(dst2); + } + } + + return dst; +} + int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { @@ -3551,9 +3680,17 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; + int ret = 0; + if (!xfrm_selector_match(&x->sel, &fl, family)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); - return 0; + ret = 1; + if (x->props.flags & XFRM_STATE_ICMP && + xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl)) + ret = 0; + if (ret) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); + return 0; + } } } } @@ -3576,6 +3713,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 0; } + if (!pol && dir == XFRM_POLICY_FWD) + pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id); + if (!pol) { if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); @@ -3709,6 +3849,10 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) res = 0; dst = NULL; } + + if (dst && !dst->xfrm) + dst = xfrm_out_fwd_icmp(skb, &fl, family, dst); + skb_dst_set(skb, dst); return res; } @@ -3765,15 +3909,10 @@ static void xfrm_link_failure(struct sk_buff *skb) /* Impossible. Such dst must be popped before reaches point of failure. */ } -static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst) { - if (dst->obsolete) { - dst_release(dst); - dst = NULL; - } - } - return dst; + if (dst->obsolete) + sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) @@ -4027,10 +4166,7 @@ static int __net_init xfrm_policy_init(struct net *net) int dir, err; if (net_eq(net, &init_net)) { - xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", - sizeof(struct xfrm_dst), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); + xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index fee9b5cf37..eeb984be03 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -41,6 +41,8 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID), SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), + SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), SNMP_MIB_SENTINEL }; @@ -52,6 +54,7 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX); + xfrm_state_update_stats(net); snmp_get_cpu_field_batch(buff, xfrm_mib_list, net->mib.xfrm_statistics); for (i = 0; xfrm_mib_list[i].name; i++) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index ce56d659c5..bc56c63057 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -778,7 +778,8 @@ int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack) } if (x->props.flags & XFRM_STATE_ESN) { - if (replay_esn->replay_window == 0) { + if (replay_esn->replay_window == 0 && + (!x->dir || x->dir == XFRM_SA_DIR_IN)) { NL_SET_ERR_MSG(extack, "ESN replay window must be > 0"); return -EINVAL; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bda5327bf3..67b2a399a4 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -49,6 +49,7 @@ static struct kmem_cache *xfrm_state_cache __ro_after_init; static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); static HLIST_HEAD(xfrm_state_gc_list); +static HLIST_HEAD(xfrm_state_dev_gc_list); static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { @@ -214,6 +215,7 @@ static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO]; static DEFINE_SPINLOCK(xfrm_state_gc_lock); +static DEFINE_SPINLOCK(xfrm_state_dev_gc_lock); int __xfrm_state_delete(struct xfrm_state *x); @@ -570,7 +572,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) int err = 0; spin_lock(&x->lock); - xfrm_dev_state_update_curlft(x); + xfrm_dev_state_update_stats(x); if (x->km.state == XFRM_STATE_DEAD) goto out; @@ -683,6 +685,41 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) } EXPORT_SYMBOL(xfrm_state_alloc); +#ifdef CONFIG_XFRM_OFFLOAD +void xfrm_dev_state_delete(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = READ_ONCE(xso->dev); + + if (dev) { + dev->xfrmdev_ops->xdo_dev_state_delete(x); + spin_lock_bh(&xfrm_state_dev_gc_lock); + hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list); + spin_unlock_bh(&xfrm_state_dev_gc_lock); + } +} +EXPORT_SYMBOL_GPL(xfrm_dev_state_delete); + +void xfrm_dev_state_free(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = READ_ONCE(xso->dev); + + if (dev && dev->xfrmdev_ops) { + spin_lock_bh(&xfrm_state_dev_gc_lock); + if (!hlist_unhashed(&x->dev_gclist)) + hlist_del(&x->dev_gclist); + spin_unlock_bh(&xfrm_state_dev_gc_lock); + + if (dev->xfrmdev_ops->xdo_dev_state_free) + dev->xfrmdev_ops->xdo_dev_state_free(x); + WRITE_ONCE(xso->dev, NULL); + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + netdev_put(dev, &xso->dev_tracker); + } +} +#endif + void __xfrm_state_destroy(struct xfrm_state *x, bool sync) { WARN_ON(x->km.state != XFRM_STATE_DEAD); @@ -848,6 +885,9 @@ EXPORT_SYMBOL(xfrm_state_flush); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) { + struct xfrm_state *x; + struct hlist_node *tmp; + struct xfrm_dev_offload *xso; int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -857,8 +897,6 @@ int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_vali err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { - struct xfrm_state *x; - struct xfrm_dev_offload *xso; restart: hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; @@ -868,6 +906,8 @@ restart: spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); + xfrm_dev_state_free(x); + xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); xfrm_state_put(x); @@ -884,6 +924,24 @@ restart: out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); + + spin_lock_bh(&xfrm_state_dev_gc_lock); +restart_gc: + hlist_for_each_entry_safe(x, tmp, &xfrm_state_dev_gc_list, dev_gclist) { + xso = &x->xso; + + if (xso->dev == dev) { + spin_unlock_bh(&xfrm_state_dev_gc_lock); + xfrm_dev_state_free(x); + spin_lock_bh(&xfrm_state_dev_gc_lock); + goto restart_gc; + } + + } + spin_unlock_bh(&xfrm_state_dev_gc_lock); + + xfrm_flush_gc(); + return err; } EXPORT_SYMBOL(xfrm_dev_state_flush); @@ -1273,8 +1331,7 @@ found: xso->dev = xdo->dev; xso->real_dev = xdo->real_dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; - netdev_tracker_alloc(xso->dev, &xso->dev_tracker, - GFP_ATOMIC); + netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC); error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); if (error) { xso->dir = 0; @@ -1292,6 +1349,7 @@ found: if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; + x->dir = XFRM_SA_DIR_OUT; list_add(&x->km.all, &net->xfrm.state_all); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, @@ -1744,6 +1802,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->lastused = orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; + x->dir = orig->dir; return x; @@ -1864,8 +1923,14 @@ int xfrm_state_update(struct xfrm_state *x) } if (x1->km.state == XFRM_STATE_ACQ) { + if (x->dir && x1->dir != x->dir) + goto out; + __xfrm_state_insert(x); x = NULL; + } else { + if (x1->dir != x->dir) + goto out; } err = 0; @@ -1935,7 +2000,7 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { - xfrm_dev_state_update_curlft(x); + xfrm_dev_state_update_stats(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); @@ -1957,6 +2022,19 @@ int xfrm_state_check_expire(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_check_expire); +void xfrm_state_update_stats(struct net *net) +{ + struct xfrm_state *x; + int i; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + hlist_for_each_entry(x, net->xfrm.state_bydst + i, bydst) + xfrm_dev_state_update_stats(x); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); +} + struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c index 9e20d4a377..2248eda741 100644 --- a/net/xfrm/xfrm_state_bpf.c +++ b/net/xfrm/xfrm_state_bpf.c @@ -117,10 +117,10 @@ __bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __bpf_kfunc_end_defs(); -BTF_SET8_START(xfrm_state_kfunc_set) +BTF_KFUNCS_START(xfrm_state_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE) -BTF_SET8_END(xfrm_state_kfunc_set) +BTF_KFUNCS_END(xfrm_state_kfunc_set) static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = { .owner = THIS_MODULE, diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c index 7fdeafc838..ca003e8a03 100644 --- a/net/xfrm/xfrm_sysctl.c +++ b/net/xfrm/xfrm_sysctl.c @@ -38,7 +38,6 @@ static struct ctl_table xfrm_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - {} }; int __net_init xfrm_sysctl_init(struct net *net) @@ -57,10 +56,8 @@ int __net_init xfrm_sysctl_init(struct net *net) table[3].data = &net->xfrm.sysctl_acq_expires; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } net->xfrm.sysctl_hdr = register_net_sysctl_sz(net, "net/core", table, table_size); @@ -76,7 +73,7 @@ out_kmemdup: void __net_exit xfrm_sysctl_fini(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->xfrm.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->xfrm.sysctl_hdr); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 912c1189ba..77355422ce 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -130,7 +130,7 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_a } static inline int verify_replay(struct xfrm_usersa_info *p, - struct nlattr **attrs, + struct nlattr **attrs, u8 sa_dir, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; @@ -168,6 +168,30 @@ static inline int verify_replay(struct xfrm_usersa_info *p, return -EINVAL; } + if (sa_dir == XFRM_SA_DIR_OUT) { + if (rs->replay_window) { + NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA"); + return -EINVAL; + } + if (rs->seq || rs->seq_hi) { + NL_SET_ERR_MSG(extack, + "Replay seq and seq_hi should be 0 for output SA"); + return -EINVAL; + } + if (rs->bmp_len) { + NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA"); + return -EINVAL; + } + } + + if (sa_dir == XFRM_SA_DIR_IN) { + if (rs->oseq || rs->oseq_hi) { + NL_SET_ERR_MSG(extack, + "Replay oseq and oseq_hi should be 0 for input SA"); + return -EINVAL; + } + } + return 0; } @@ -176,6 +200,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, struct netlink_ext_ack *extack) { int err; + u8 sa_dir = attrs[XFRMA_SA_DIR] ? nla_get_u8(attrs[XFRMA_SA_DIR]) : 0; err = -EINVAL; switch (p->family) { @@ -334,7 +359,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; if ((err = verify_sec_ctx_len(attrs, extack))) goto out; - if ((err = verify_replay(p, attrs, extack))) + if ((err = verify_replay(p, attrs, sa_dir, extack))) goto out; err = -EINVAL; @@ -358,6 +383,77 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, err = -EINVAL; goto out; } + + if (sa_dir == XFRM_SA_DIR_OUT) { + NL_SET_ERR_MSG(extack, + "MTIMER_THRESH attribute should not be set on output SA"); + err = -EINVAL; + goto out; + } + } + + if (sa_dir == XFRM_SA_DIR_OUT) { + if (p->flags & XFRM_STATE_DECAP_DSCP) { + NL_SET_ERR_MSG(extack, "Flag DECAP_DSCP should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (p->flags & XFRM_STATE_ICMP) { + NL_SET_ERR_MSG(extack, "Flag ICMP should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (p->flags & XFRM_STATE_WILDRECV) { + NL_SET_ERR_MSG(extack, "Flag WILDRECV should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (p->replay_window) { + NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_REPLAY_VAL]) { + struct xfrm_replay_state *replay; + + replay = nla_data(attrs[XFRMA_REPLAY_VAL]); + + if (replay->seq || replay->bitmap) { + NL_SET_ERR_MSG(extack, + "Replay seq and bitmap should be 0 for output SA"); + err = -EINVAL; + goto out; + } + } + } + + if (sa_dir == XFRM_SA_DIR_IN) { + if (p->flags & XFRM_STATE_NOPMTUDISC) { + NL_SET_ERR_MSG(extack, "Flag NOPMTUDISC should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_SA_EXTRA_FLAGS]) { + u32 xflags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); + + if (xflags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) { + NL_SET_ERR_MSG(extack, "Flag DONT_ENCAP_DSCP should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (xflags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP) { + NL_SET_ERR_MSG(extack, "Flag OSEQ_MAY_WRAP should not be set for input SA"); + err = -EINVAL; + goto out; + } + + } } out: @@ -734,6 +830,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_SA_DIR]) + x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; @@ -902,7 +1001,7 @@ static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); if (x->xso.dev) - xfrm_dev_state_update_curlft(x); + xfrm_dev_state_update_stats(x); memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); put_unaligned(x->stats.replay_window, &p->stats.replay_window); put_unaligned(x->stats.replay, &p->stats.replay); @@ -1182,8 +1281,13 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } - if (x->mapping_maxage) + if (x->mapping_maxage) { ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); + if (ret) + goto out; + } + if (x->dir) + ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); out: return ret; } @@ -1618,6 +1722,9 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) goto out; + if (attrs[XFRMA_SA_DIR]) + x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]); + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); @@ -2348,7 +2455,6 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid); } } else { - xfrm_dev_policy_delete(xp); xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err != 0) @@ -2402,7 +2508,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur)) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ - + nla_total_size(4); /* XFRM_AE_ETHR */ + + nla_total_size(4) /* XFRM_AE_ETHR */ + + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -2459,6 +2566,12 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) goto out_cancel; + if (x->dir) { + err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); + if (err) + goto out_cancel; + } + nlmsg_end(skb, nlh); return 0; @@ -3018,6 +3131,7 @@ EXPORT_SYMBOL_GPL(xfrm_msg_min); #undef XMSGSIZE const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { + [XFRMA_UNSPEC] = { .strict_start_type = XFRMA_SA_DIR }, [XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)}, [XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)}, [XFRMA_LASTUSED] = { .type = NLA_U64}, @@ -3049,6 +3163,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, [XFRMA_IF_ID] = { .type = NLA_U32 }, [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), }; EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3097,6 +3212,24 @@ static const struct xfrm_link { [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_get_default }, }; +static int xfrm_reject_unused_attr(int type, struct nlattr **attrs, + struct netlink_ext_ack *extack) +{ + if (attrs[XFRMA_SA_DIR]) { + switch (type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_UPDSA: + case XFRM_MSG_ALLOCSPI: + break; + default: + NL_SET_ERR_MSG(extack, "Invalid attribute SA_DIR"); + return -EINVAL; + } + } + + return 0; +} + static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -3156,6 +3289,12 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto err; + if (!link->nla_pol || link->nla_pol == xfrma_policy) { + err = xfrm_reject_unused_attr((type + XFRM_MSG_BASE), attrs, extack); + if (err < 0) + goto err; + } + if (link->doit == NULL) { err = -EINVAL; goto err; @@ -3189,8 +3328,9 @@ static void xfrm_netlink_rcv(struct sk_buff *skb) static inline unsigned int xfrm_expire_msgsize(void) { - return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) - + nla_total_size(sizeof(struct xfrm_mark)); + return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + + nla_total_size(sizeof(struct xfrm_mark)) + + nla_total_size(sizeof_field(struct xfrm_state, dir)); } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -3217,6 +3357,12 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) return err; + if (x->dir) { + err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); + if (err) + return err; + } + nlmsg_end(skb, nlh); return 0; } @@ -3324,6 +3470,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->mapping_maxage) l += nla_total_size(sizeof(x->mapping_maxage)); + if (x->dir) + l += nla_total_size(sizeof(x->dir)); + return l; } |