diff options
Diffstat (limited to 'net')
396 files changed, 11465 insertions, 3825 deletions
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 7b3341cef9..850d4a185f 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -179,4 +179,5 @@ static void __exit lowpan_module_exit(void) module_init(lowpan_module_init); module_exit(lowpan_module_exit); +MODULE_DESCRIPTION("IPv6 over Low-Power Wireless Personal Area Network core module"); MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 2a7f1b1571..407b2335f0 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -702,20 +702,7 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); - const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; - struct phy_device *phydev = vlan->real_dev->phydev; - - if (phy_has_tsinfo(phydev)) { - return phy_ts_info(phydev, info); - } else if (ops->get_ts_info) { - return ops->get_ts_info(vlan->real_dev, info); - } else { - info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info->phc_index = -1; - } - - return 0; + return ethtool_get_ts_info_by_layer(vlan->real_dev, info); } static void vlan_dev_get_stats64(struct net_device *dev, diff --git a/net/9p/client.c b/net/9p/client.c index e265a0ca6b..f7e90b4769 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1583,7 +1583,7 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, received = rsize; } - p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", received); if (non_zc) { int n = copy_to_iter(dataptr, received, to); @@ -1609,9 +1609,6 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) int total = 0; *err = 0; - p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", - fid->fid, offset, iov_iter_count(from)); - while (iov_iter_count(from)) { int count = iov_iter_count(from); int rsize = fid->iounit; @@ -1623,6 +1620,9 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) if (count < rsize) rsize = count; + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d (/%d)\n", + fid->fid, offset, rsize, count); + /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, @@ -1650,7 +1650,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) written = rsize; } - p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", written); p9_req_put(clnt, req); iov_iter_revert(from, count - written - iov_iter_count(from)); diff --git a/net/Kconfig b/net/Kconfig index 3ec6bc98fa..4adc47d0c9 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -233,8 +233,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/bpfilter/Kconfig" - source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" diff --git a/net/Makefile b/net/Makefile index 4c4dc53545..b06b5539e7 100644 --- a/net/Makefile +++ b/net/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX_SCM) += unix/ obj-y += ipv6/ -obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index a852ec093f..198f5ba2fe 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1581,7 +1581,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } /* Build a packet */ - SOCK_DEBUG(sk, "SK %p: Got address.\n", sk); + net_dbg_ratelimited("SK %p: Got address.\n", sk); /* For headers */ size = sizeof(struct ddpehdr) + len + ddp_dl->header_length; @@ -1602,7 +1602,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dev = rt->dev; - SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", + net_dbg_ratelimited("SK %p: Size needed %d, device %s\n", sk, size, dev->name); hard_header_len = dev->hard_header_len; @@ -1631,7 +1631,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) skb_reserve(skb, hard_header_len); skb->dev = dev; - SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); + net_dbg_ratelimited("SK %p: Begin build.\n", sk); ddp = skb_put(skb, sizeof(struct ddpehdr)); ddp->deh_len_hops = htons(len + sizeof(*ddp)); @@ -1642,7 +1642,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) ddp->deh_dport = usat->sat_port; ddp->deh_sport = at->src_port; - SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len); + net_dbg_ratelimited("SK %p: Copy user data (%zd bytes).\n", sk, len); err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) { @@ -1666,7 +1666,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (skb2) { loopback = 1; - SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk); + net_dbg_ratelimited("SK %p: send out(copy).\n", sk); /* * If it fails it is queued/sent above in the aarp queue */ @@ -1675,7 +1675,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } if (dev->flags & IFF_LOOPBACK || loopback) { - SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk); + net_dbg_ratelimited("SK %p: Loop back.\n", sk); /* loop back */ skb_orphan(skb); if (ddp->deh_dnode == ATADDR_BCAST) { @@ -1689,7 +1689,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } ddp_dl->request(ddp_dl, skb, dev->dev_addr); } else { - SOCK_DEBUG(sk, "SK %p: send out.\n", sk); + net_dbg_ratelimited("SK %p: send out.\n", sk); if (rt->flags & RTF_GATEWAY) { gsat.sat_addr = rt->gateway; usat = &gsat; @@ -1700,7 +1700,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) */ aarp_send_ddp(dev, skb, &usat->sat_addr, NULL); } - SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len); + net_dbg_ratelimited("SK %p: Done write (%zd).\n", sk, len); out: release_sock(sk); diff --git a/net/atm/common.c b/net/atm/common.c index f7019df41c..2a1ec014e9 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -890,6 +890,7 @@ subsys_initcall(atm_init); module_exit(atm_exit); +MODULE_DESCRIPTION("Asynchronous Transfer Mode (ATM) networking core"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_ATMPVC); MODULE_ALIAS_NETPROTO(PF_ATMSVC); diff --git a/net/atm/lec.c b/net/atm/lec.c index 6257bf12e5..ffef658862 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -2234,4 +2234,5 @@ out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } +MODULE_DESCRIPTION("ATM LAN Emulation (LANE) support"); MODULE_LICENSE("GPL"); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 033871e718..324e3ab96b 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1532,4 +1532,5 @@ static void __exit atm_mpoa_cleanup(void) module_init(atm_mpoa_init); module_exit(atm_mpoa_cleanup); +MODULE_DESCRIPTION("Multi-Protocol Over ATM (MPOA) driver"); MODULE_LICENSE("GPL"); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 558e158c98..9169efb2f4 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -103,7 +103,7 @@ again: s->ax25_dev = NULL; if (sk->sk_socket) { netdev_put(ax25_dev->dev, - &ax25_dev->dev_tracker); + &s->dev_tracker); ax25_dev_put(ax25_dev); } ax25_cb_del(s); diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index c5462486db..282ec581c0 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -105,7 +105,7 @@ void ax25_dev_device_down(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE - ax25_ds_del_timer(ax25_dev); + timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 3bd0760c76..b51d8b071b 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -20,6 +20,7 @@ batman-adv-y += hash.o batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o +batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o batman-adv-y += netlink.o batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 37ce6cfb35..5f46ca3d4b 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -20,7 +20,6 @@ #include <linux/if_vlan.h> #include <linux/jhash.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> @@ -31,6 +30,7 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index c120c7c6d2..757c084ac2 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -25,7 +25,6 @@ #include "hard-interface.h" #include "originator.h" -#include "routing.h" #include "send.h" /** @@ -351,18 +350,14 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_orig_node *orig_node_src) { struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); - struct batadv_orig_node *orig_node_dst; struct batadv_neigh_node *neigh_node = NULL; struct batadv_frag_packet *packet; u16 total_size; bool ret = false; packet = (struct batadv_frag_packet *)skb->data; - orig_node_dst = batadv_orig_hash_find(bat_priv, packet->dest); - if (!orig_node_dst) - goto out; - neigh_node = batadv_find_router(bat_priv, orig_node_dst, recv_if); + neigh_node = batadv_orig_to_router(bat_priv, packet->dest, recv_if); if (!neigh_node) goto out; @@ -381,7 +376,6 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, } out: - batadv_orig_node_put(orig_node_dst); batadv_neigh_node_put(neigh_node); return ret; } diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index d26124bc27..0ddd8b4b3f 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -18,7 +18,6 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> @@ -29,6 +28,7 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/udp.h> #include <net/sock.h> diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index e8a4499155..5fc754b0b3 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -6,6 +6,7 @@ #include "main.h" +#include <linux/array_size.h> #include <linux/atomic.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> @@ -20,7 +21,6 @@ #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> -#include <linux/kernel.h> #include <linux/kobject.h> #include <linux/kref.h> #include <linux/list.h> @@ -33,6 +33,7 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> @@ -532,6 +533,8 @@ static void batadv_recv_handler_init(void) /* broadcast packet */ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet; + /* multicast packet */ + batadv_rx_handler[BATADV_MCAST] = batadv_recv_mcast_packet; /* unicast packets ... */ /* unicast with 4 addresses packet */ diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 10007c5894..870dcd7f17 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2023.3" +#define BATADV_SOURCE_VERSION "2024.0" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 315394f12c..14088c4ff2 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -25,7 +25,6 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> -#include <linux/kernel.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> @@ -36,6 +35,7 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> @@ -236,6 +236,37 @@ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, } /** + * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags + * @bat_priv: the bat priv with all the soft interface information + * + * Checks if all active hard interfaces have an MTU larger or equal to 1280 + * bytes (IPv6 minimum MTU). + * + * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise. + */ +static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) +{ + const struct batadv_hard_iface *hard_iface; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) + continue; + + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { + rcu_read_unlock(); + return BATADV_NO_FLAGS; + } + } + rcu_read_unlock(); + + return BATADV_MCAST_HAVE_MC_PTYPE_CAPA; +} + +/** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the soft interface information * @@ -256,6 +287,7 @@ batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) mla_flags.enabled = 1; mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, bridge); + mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv); if (!bridge) return mla_flags; @@ -806,23 +838,25 @@ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; - char str_old_flags[] = "[.... . ]"; + char str_old_flags[] = "[.... . .]"; - sprintf(str_old_flags, "[%c%c%c%s%s]", + sprintf(str_old_flags, "[%c%c%c%s%s%c]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", - !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); + !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", + !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); batadv_dbg(BATADV_DBG_MCAST, bat_priv, - "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n", + "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n", old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", - !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); + !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", + !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); } /** @@ -1136,16 +1170,61 @@ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, } /** + * batadv_mcast_forw_mode_by_count() - get forwarding mode by count + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to check + * @vid: the vlan identifier + * @is_routable: stores whether the destination is routable + * @count: the number of originators the multicast packet need to be sent to + * + * For a multicast packet with multiple destination originators, checks which + * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a + * complete batman-adv multicast header. + * + * Return: + * BATADV_FORW_MCAST: If all nodes have multicast packet routing + * capabilities and an MTU >= 1280 on all hard interfaces (including us) + * and the encapsulated multicast packet with all destination addresses + * would still fit into an 1280 bytes batman-adv multicast packet + * (excluding the outer ethernet frame) and we could successfully push + * the full batman-adv multicast packet header. + * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv + * multicast packet and the amount of batman-adv unicast packets needed + * is smaller or equal to the configured multicast fanout. + * BATADV_FORW_BCAST: Otherwise. + */ +static enum batadv_forw_mode +batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid, + int is_routable, int count) +{ + unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count); + u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags; + + if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) && + own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && + skb->len + mcast_hdrlen <= IPV6_MIN_MTU && + batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count)) + return BATADV_FORW_MCAST; + + if (count <= atomic_read(&bat_priv->multicast_fanout)) + return BATADV_FORW_UCASTS; + + return BATADV_FORW_BCAST; +} + +/** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to check + * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Return: The forwarding mode as enum batadv_forw_mode. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - int *is_routable) + unsigned short vid, int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; @@ -1175,10 +1254,8 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, else if (unsnoop_count) return BATADV_FORW_BCAST; - if (total_count <= atomic_read(&bat_priv->multicast_fanout)) - return BATADV_FORW_UCASTS; - - return BATADV_FORW_BCAST; + return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable, + total_count); } /** @@ -1739,6 +1816,31 @@ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, } /** + * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter + * @bat_priv: the bat priv with all the soft interface information + * @orig: the orig_node which multicast state might have changed of + * @mcast_flags: flags indicating the new multicast state + * + * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has + * toggled then this method updates the counter accordingly. + */ +static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig, + u8 mcast_flags) +{ + lockdep_assert_held(&orig->mcast_handler_lock); + + /* switched from flag set to unset */ + if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) && + orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) + atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa); + /* switched from flag unset to set */ + else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && + !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)) + atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa); +} + +/** * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV * @enabled: whether the originator has multicast TVLV support enabled * @tvlv_value: tvlv buffer containing the multicast flags @@ -1806,6 +1908,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); + batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); @@ -1820,6 +1923,10 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); + batadv_tvlv_handler_register(bat_priv, NULL, NULL, + batadv_mcast_forw_tracker_tvlv_handler, + BATADV_TVLV_MCAST_TRACKER, 1, + BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); @@ -2068,6 +2175,7 @@ void batadv_mcast_free(struct batadv_priv *bat_priv) cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); + batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ @@ -2091,6 +2199,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) BATADV_MCAST_WANT_NO_RTR4); batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR6); + batadv_mcast_have_mc_ptype_update(bat_priv, orig, + BATADV_MCAST_HAVE_MC_PTYPE_CAPA); spin_unlock_bh(&orig->mcast_handler_lock); } diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index a9770d8d6d..d97ee51d26 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -11,6 +11,7 @@ #include <linux/netlink.h> #include <linux/skbuff.h> +#include <linux/types.h> /** * enum batadv_forw_mode - the way a packet should be forwarded as @@ -28,6 +29,12 @@ enum batadv_forw_mode { */ BATADV_FORW_UCASTS, + /** + * @BATADV_FORW_MCAST: forward the packet to some nodes via a + * batman-adv multicast packet + */ + BATADV_FORW_MCAST, + /** @BATADV_FORW_NONE: don't forward, drop it */ BATADV_FORW_NONE, }; @@ -36,7 +43,7 @@ enum batadv_forw_mode { enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - int *is_routable); + unsigned short vid, int *is_routable); int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable); @@ -52,11 +59,23 @@ void batadv_mcast_free(struct batadv_priv *bat_priv); void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); +/* multicast_forw.c */ + +int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv, + struct sk_buff *skb); + +unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests); + +bool batadv_mcast_forw_push(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, int is_routable, int count); + +int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv, struct sk_buff *skb); + #else static inline enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, - int *is_routable) + unsigned short vid, int *is_routable) { return BATADV_FORW_BCAST; } @@ -94,6 +113,13 @@ static inline void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node) { } +static inline int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} + #endif /* CONFIG_BATMAN_ADV_MCAST */ #endif /* _NET_BATMAN_ADV_MULTICAST_H_ */ diff --git a/net/batman-adv/multicast_forw.c b/net/batman-adv/multicast_forw.c new file mode 100644 index 0000000000..fafd6ba8c0 --- /dev/null +++ b/net/batman-adv/multicast_forw.c @@ -0,0 +1,1178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) B.A.T.M.A.N. contributors: + * + * Linus Lüssing + */ + +#include "multicast.h" +#include "main.h" + +#include <linux/bug.h> +#include <linux/build_bug.h> +#include <linux/byteorder/generic.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/etherdevice.h> +#include <linux/gfp.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <linux/ipv6.h> +#include <linux/limits.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> +#include <uapi/linux/batadv_packet.h> + +#include "bridge_loop_avoidance.h" +#include "originator.h" +#include "send.h" +#include "translation-table.h" + +#define batadv_mcast_forw_tracker_for_each_dest(dest, num_dests) \ + for (; num_dests; num_dests--, (dest) += ETH_ALEN) + +#define batadv_mcast_forw_tracker_for_each_dest2(dest1, dest2, num_dests) \ + for (; num_dests; num_dests--, (dest1) += ETH_ALEN, (dest2) += ETH_ALEN) + +/** + * batadv_mcast_forw_skb_push() - skb_push and memorize amount of pushed bytes + * @skb: the skb to push onto + * @size: the amount of bytes to push + * @len: stores the total amount of bytes pushed + * + * Performs an skb_push() onto the given skb and adds the amount of pushed bytes + * to the given len pointer. + * + * Return: the return value of the skb_push() call. + */ +static void *batadv_mcast_forw_skb_push(struct sk_buff *skb, size_t size, + unsigned short *len) +{ + *len += size; + return skb_push(skb, size); +} + +/** + * batadv_mcast_forw_push_padding() - push 2 padding bytes to skb's front + * @skb: the skb to push onto + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Pushes two padding bytes to the front of the given skb. + * + * Return: On success a pointer to the first byte of the two pushed padding + * bytes within the skb. NULL otherwise. + */ +static char * +batadv_mcast_forw_push_padding(struct sk_buff *skb, unsigned short *tvlv_len) +{ + const int pad_len = 2; + char *padding; + + if (skb_headroom(skb) < pad_len) + return NULL; + + padding = batadv_mcast_forw_skb_push(skb, pad_len, tvlv_len); + memset(padding, 0, pad_len); + + return padding; +} + +/** + * batadv_mcast_forw_push_est_padding() - push padding bytes if necessary + * @skb: the skb to potentially push the padding onto + * @count: the (estimated) number of originators the multicast packet needs to + * be sent to + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * If the number of destination entries is even then this adds two + * padding bytes to the end of the tracker TVLV. + * + * Return: true on success or if no padding is needed, false otherwise. + */ +static bool +batadv_mcast_forw_push_est_padding(struct sk_buff *skb, int count, + unsigned short *tvlv_len) +{ + if (!(count % 2) && !batadv_mcast_forw_push_padding(skb, tvlv_len)) + return false; + + return true; +} + +/** + * batadv_mcast_forw_orig_entry() - get orig_node from an hlist node + * @node: the hlist node to get the orig_node from + * @entry_offset: the offset of the hlist node within the orig_node struct + * + * Return: The orig_node containing the hlist node on success, NULL on error. + */ +static struct batadv_orig_node * +batadv_mcast_forw_orig_entry(struct hlist_node *node, + size_t entry_offset) +{ + /* sanity check */ + switch (entry_offset) { + case offsetof(struct batadv_orig_node, mcast_want_all_ipv4_node): + case offsetof(struct batadv_orig_node, mcast_want_all_ipv6_node): + case offsetof(struct batadv_orig_node, mcast_want_all_rtr4_node): + case offsetof(struct batadv_orig_node, mcast_want_all_rtr6_node): + break; + default: + WARN_ON(1); + return NULL; + } + + return (struct batadv_orig_node *)((void *)node - entry_offset); +} + +/** + * batadv_mcast_forw_push_dest() - push an originator MAC address onto an skb + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the destination address onto + * @vid: the vlan identifier + * @orig_node: the originator node to get the MAC address from + * @num_dests: a pointer to store the number of pushed addresses in + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * If the orig_node is a BLA backbone gateway, if there is not enough skb + * headroom available or if num_dests is already at its maximum (65535) then + * neither the skb nor num_dests is changed. Otherwise the originator's MAC + * address is pushed onto the given skb and num_dests incremented by one. + * + * Return: true if the orig_node is a backbone gateway or if an orig address + * was pushed successfully, false otherwise. + */ +static bool batadv_mcast_forw_push_dest(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid, + struct batadv_orig_node *orig_node, + unsigned short *num_dests, + unsigned short *tvlv_len) +{ + BUILD_BUG_ON(sizeof_field(struct batadv_tvlv_mcast_tracker, num_dests) + != sizeof(__be16)); + + /* Avoid sending to other BLA gateways - they already got the frame from + * the LAN side we share with them. + * TODO: Refactor to take BLA into account earlier in mode check. + */ + if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) + return true; + + if (skb_headroom(skb) < ETH_ALEN || *num_dests == U16_MAX) + return false; + + batadv_mcast_forw_skb_push(skb, ETH_ALEN, tvlv_len); + ether_addr_copy(skb->data, orig_node->orig); + (*num_dests)++; + + return true; +} + +/** + * batadv_mcast_forw_push_dests_list() - push originators from list onto an skb + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the destination addresses onto + * @vid: the vlan identifier + * @head: the list to gather originators from + * @entry_offset: offset of an hlist node in an orig_node structure + * @num_dests: a pointer to store the number of pushed addresses in + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Push the MAC addresses of all originators in the given list onto the given + * skb. + * + * Return: true on success, false otherwise. + */ +static int batadv_mcast_forw_push_dests_list(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + struct hlist_head *head, + size_t entry_offset, + unsigned short *num_dests, + unsigned short *tvlv_len) +{ + struct hlist_node *node; + struct batadv_orig_node *orig_node; + + rcu_read_lock(); + __hlist_for_each_rcu(node, head) { + orig_node = batadv_mcast_forw_orig_entry(node, entry_offset); + if (!orig_node || + !batadv_mcast_forw_push_dest(bat_priv, skb, vid, orig_node, + num_dests, tvlv_len)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + + return true; +} + +/** + * batadv_mcast_forw_push_tt() - push originators with interest through TT + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the destination addresses onto + * @vid: the vlan identifier + * @num_dests: a pointer to store the number of pushed addresses in + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Push the MAC addresses of all originators which have indicated interest in + * this multicast packet through the translation table onto the given skb. + * + * Return: true on success, false otherwise. + */ +static bool +batadv_mcast_forw_push_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, unsigned short *num_dests, + unsigned short *tvlv_len) +{ + struct batadv_tt_orig_list_entry *orig_entry; + + struct batadv_tt_global_entry *tt_global; + const u8 *addr = eth_hdr(skb)->h_dest; + + /* ok */ + int ret = true; + + tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); + if (!tt_global) + goto out; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { + if (!batadv_mcast_forw_push_dest(bat_priv, skb, vid, + orig_entry->orig_node, + num_dests, tvlv_len)) { + ret = false; + break; + } + } + rcu_read_unlock(); + + batadv_tt_global_entry_put(tt_global); + +out: + return ret; +} + +/** + * batadv_mcast_forw_push_want_all() - push originators with want-all flag + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the destination addresses onto + * @vid: the vlan identifier + * @num_dests: a pointer to store the number of pushed addresses in + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Push the MAC addresses of all originators which have indicated interest in + * this multicast packet through the want-all flag onto the given skb. + * + * Return: true on success, false otherwise. + */ +static bool batadv_mcast_forw_push_want_all(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + unsigned short *num_dests, + unsigned short *tvlv_len) +{ + struct hlist_head *head = NULL; + size_t offset; + int ret; + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + head = &bat_priv->mcast.want_all_ipv4_list; + offset = offsetof(struct batadv_orig_node, + mcast_want_all_ipv4_node); + break; + case htons(ETH_P_IPV6): + head = &bat_priv->mcast.want_all_ipv6_list; + offset = offsetof(struct batadv_orig_node, + mcast_want_all_ipv6_node); + break; + default: + return false; + } + + ret = batadv_mcast_forw_push_dests_list(bat_priv, skb, vid, head, + offset, num_dests, tvlv_len); + if (!ret) + return false; + + return true; +} + +/** + * batadv_mcast_forw_push_want_rtr() - push originators with want-router flag + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the destination addresses onto + * @vid: the vlan identifier + * @num_dests: a pointer to store the number of pushed addresses in + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Push the MAC addresses of all originators which have indicated interest in + * this multicast packet through the want-all-rtr flag onto the given skb. + * + * Return: true on success, false otherwise. + */ +static bool batadv_mcast_forw_push_want_rtr(struct batadv_priv *bat_priv, + struct sk_buff *skb, + unsigned short vid, + unsigned short *num_dests, + unsigned short *tvlv_len) +{ + struct hlist_head *head = NULL; + size_t offset; + int ret; + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + head = &bat_priv->mcast.want_all_rtr4_list; + offset = offsetof(struct batadv_orig_node, + mcast_want_all_rtr4_node); + break; + case htons(ETH_P_IPV6): + head = &bat_priv->mcast.want_all_rtr6_list; + offset = offsetof(struct batadv_orig_node, + mcast_want_all_rtr6_node); + break; + default: + return false; + } + + ret = batadv_mcast_forw_push_dests_list(bat_priv, skb, vid, head, + offset, num_dests, tvlv_len); + if (!ret) + return false; + + return true; +} + +/** + * batadv_mcast_forw_scrape() - remove bytes within skb data + * @skb: the skb to remove bytes from + * @offset: the offset from the skb data from which to scrape + * @len: the amount of bytes to scrape starting from the offset + * + * Scrapes/removes len bytes from the given skb at the given offset from the + * skb data. + * + * Caller needs to ensure that the region from the skb data's start up + * to/including the to be removed bytes are linearized. + */ +static void batadv_mcast_forw_scrape(struct sk_buff *skb, + unsigned short offset, + unsigned short len) +{ + char *to, *from; + + SKB_LINEAR_ASSERT(skb); + + to = skb_pull(skb, len); + from = to - len; + + memmove(to, from, offset); +} + +/** + * batadv_mcast_forw_push_scrape_padding() - remove TVLV padding + * @skb: the skb to potentially adjust the TVLV's padding on + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Remove two padding bytes from the end of the multicast tracker TVLV, + * from before the payload data. + * + * Caller needs to ensure that the TVLV bytes are linearized. + */ +static void batadv_mcast_forw_push_scrape_padding(struct sk_buff *skb, + unsigned short *tvlv_len) +{ + const int pad_len = 2; + + batadv_mcast_forw_scrape(skb, *tvlv_len - pad_len, pad_len); + *tvlv_len -= pad_len; +} + +/** + * batadv_mcast_forw_push_insert_padding() - insert TVLV padding + * @skb: the skb to potentially adjust the TVLV's padding on + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Inserts two padding bytes at the end of the multicast tracker TVLV, + * before the payload data in the given skb. + * + * Return: true on success, false otherwise. + */ +static bool batadv_mcast_forw_push_insert_padding(struct sk_buff *skb, + unsigned short *tvlv_len) +{ + unsigned short offset = *tvlv_len; + char *to, *from = skb->data; + + to = batadv_mcast_forw_push_padding(skb, tvlv_len); + if (!to) + return false; + + memmove(to, from, offset); + memset(to + offset, 0, *tvlv_len - offset); + return true; +} + +/** + * batadv_mcast_forw_push_adjust_padding() - adjust padding if necessary + * @skb: the skb to potentially adjust the TVLV's padding on + * @count: the estimated number of originators the multicast packet needs to + * be sent to + * @num_dests_pushed: the number of originators that were actually added to the + * multicast packet's tracker TVLV + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Adjusts the padding in the multicast packet's tracker TVLV depending on the + * initially estimated amount of destinations versus the amount of destinations + * that were actually added to the tracker TVLV. + * + * If the initial estimate was correct or at least the oddness was the same then + * no padding adjustment is performed. + * If the initially estimated number was even, so padding was initially added, + * but it turned out to be odd then padding is removed. + * If the initially estimated number was odd, so no padding was initially added, + * but it turned out to be even then padding is added. + * + * Return: true if no padding adjustment is needed or the adjustment was + * successful, false otherwise. + */ +static bool +batadv_mcast_forw_push_adjust_padding(struct sk_buff *skb, int *count, + unsigned short num_dests_pushed, + unsigned short *tvlv_len) +{ + int ret = true; + + if (likely((num_dests_pushed % 2) == (*count % 2))) + goto out; + + /** + * estimated even number of destinations, but turned out to be odd + * -> remove padding + */ + if (!(*count % 2) && (num_dests_pushed % 2)) + batadv_mcast_forw_push_scrape_padding(skb, tvlv_len); + /** + * estimated odd number of destinations, but turned out to be even + * -> add padding + */ + else if ((*count % 2) && (!(num_dests_pushed % 2))) + ret = batadv_mcast_forw_push_insert_padding(skb, tvlv_len); + +out: + *count = num_dests_pushed; + return ret; +} + +/** + * batadv_mcast_forw_push_dests() - push originator addresses onto an skb + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the destination addresses onto + * @vid: the vlan identifier + * @is_routable: indicates whether the destination is routable + * @count: the number of originators the multicast packet needs to be sent to + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Push the MAC addresses of all originators which have indicated interest in + * this multicast packet onto the given skb. + * + * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on + * success 0. + */ +static int +batadv_mcast_forw_push_dests(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, int is_routable, int *count, + unsigned short *tvlv_len) +{ + unsigned short num_dests = 0; + + if (!batadv_mcast_forw_push_est_padding(skb, *count, tvlv_len)) + goto err; + + if (!batadv_mcast_forw_push_tt(bat_priv, skb, vid, &num_dests, + tvlv_len)) + goto err; + + if (!batadv_mcast_forw_push_want_all(bat_priv, skb, vid, &num_dests, + tvlv_len)) + goto err; + + if (is_routable && + !batadv_mcast_forw_push_want_rtr(bat_priv, skb, vid, &num_dests, + tvlv_len)) + goto err; + + if (!batadv_mcast_forw_push_adjust_padding(skb, count, num_dests, + tvlv_len)) + goto err; + + return 0; +err: + return -ENOMEM; +} + +/** + * batadv_mcast_forw_push_tracker() - push a multicast tracker TVLV header + * @skb: the skb to push the tracker TVLV onto + * @num_dests: the number of destination addresses to set in the header + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Pushes a multicast tracker TVLV header onto the given skb, including the + * generic TVLV header but excluding the destination MAC addresses. + * + * The provided num_dests value is taken into consideration to set the + * num_dests field in the tracker header and to set the appropriate TVLV length + * value fields. + * + * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on + * success 0. + */ +static int batadv_mcast_forw_push_tracker(struct sk_buff *skb, int num_dests, + unsigned short *tvlv_len) +{ + struct batadv_tvlv_mcast_tracker *mcast_tracker; + struct batadv_tvlv_hdr *tvlv_hdr; + unsigned int tvlv_value_len; + + if (skb_headroom(skb) < sizeof(*mcast_tracker) + sizeof(*tvlv_hdr)) + return -ENOMEM; + + tvlv_value_len = sizeof(*mcast_tracker) + *tvlv_len; + if (tvlv_value_len + sizeof(*tvlv_hdr) > U16_MAX) + return -ENOMEM; + + batadv_mcast_forw_skb_push(skb, sizeof(*mcast_tracker), tvlv_len); + mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb->data; + mcast_tracker->num_dests = htons(num_dests); + + skb_reset_network_header(skb); + + batadv_mcast_forw_skb_push(skb, sizeof(*tvlv_hdr), tvlv_len); + tvlv_hdr = (struct batadv_tvlv_hdr *)skb->data; + tvlv_hdr->type = BATADV_TVLV_MCAST_TRACKER; + tvlv_hdr->version = 1; + tvlv_hdr->len = htons(tvlv_value_len); + + return 0; +} + +/** + * batadv_mcast_forw_push_tvlvs() - push a multicast tracker TVLV onto an skb + * @bat_priv: the bat priv with all the soft interface information + * @skb: the skb to push the tracker TVLV onto + * @vid: the vlan identifier + * @is_routable: indicates whether the destination is routable + * @count: the number of originators the multicast packet needs to be sent to + * @tvlv_len: stores the amount of currently pushed TVLV bytes + * + * Pushes a multicast tracker TVLV onto the given skb, including the collected + * destination MAC addresses and the generic TVLV header. + * + * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on + * success 0. + */ +static int +batadv_mcast_forw_push_tvlvs(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, int is_routable, int count, + unsigned short *tvlv_len) +{ + int ret; + + ret = batadv_mcast_forw_push_dests(bat_priv, skb, vid, is_routable, + &count, tvlv_len); + if (ret < 0) + return ret; + + ret = batadv_mcast_forw_push_tracker(skb, count, tvlv_len); + if (ret < 0) + return ret; + + return 0; +} + +/** + * batadv_mcast_forw_push_hdr() - push a multicast packet header onto an skb + * @skb: the skb to push the header onto + * @tvlv_len: the total TVLV length value to set in the header + * + * Pushes a batman-adv multicast packet header onto the given skb and sets + * the provided total TVLV length value in it. + * + * Caller needs to ensure enough skb headroom is available. + * + * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on + * success 0. + */ +static int +batadv_mcast_forw_push_hdr(struct sk_buff *skb, unsigned short tvlv_len) +{ + struct batadv_mcast_packet *mcast_packet; + + if (skb_headroom(skb) < sizeof(*mcast_packet)) + return -ENOMEM; + + skb_push(skb, sizeof(*mcast_packet)); + + mcast_packet = (struct batadv_mcast_packet *)skb->data; + mcast_packet->version = BATADV_COMPAT_VERSION; + mcast_packet->ttl = BATADV_TTL; + mcast_packet->packet_type = BATADV_MCAST; + mcast_packet->reserved = 0; + mcast_packet->tvlv_len = htons(tvlv_len); + + return 0; +} + +/** + * batadv_mcast_forw_scrub_dests() - scrub destinations in a tracker TVLV + * @bat_priv: the bat priv with all the soft interface information + * @comp_neigh: next hop neighbor to scrub+collect destinations for + * @dest: start MAC entry in original skb's tracker TVLV + * @next_dest: start MAC entry in to be sent skb's tracker TVLV + * @num_dests: number of remaining destination MAC entries to iterate over + * + * This sorts destination entries into either the original batman-adv + * multicast packet or the skb (copy) that is going to be sent to comp_neigh + * next. + * + * In preparation for the next, to be (unicast) transmitted batman-adv multicast + * packet skb to be sent to the given neighbor node, tries to collect all + * originator MAC addresses that have the given neighbor node as their next hop + * in the to be transmitted skb (copy), which next_dest points into. That is we + * zero all destination entries in next_dest which do not have comp_neigh as + * their next hop. And zero all destination entries in the original skb that + * would have comp_neigh as their next hop (to avoid redundant transmissions and + * duplicated payload later). + */ +static void +batadv_mcast_forw_scrub_dests(struct batadv_priv *bat_priv, + struct batadv_neigh_node *comp_neigh, u8 *dest, + u8 *next_dest, u16 num_dests) +{ + struct batadv_neigh_node *next_neigh; + + /* skip first entry, this is what we are comparing with */ + eth_zero_addr(dest); + dest += ETH_ALEN; + next_dest += ETH_ALEN; + num_dests--; + + batadv_mcast_forw_tracker_for_each_dest2(dest, next_dest, num_dests) { + if (is_zero_ether_addr(next_dest)) + continue; + + /* sanity check, we expect unicast destinations */ + if (is_multicast_ether_addr(next_dest)) { + eth_zero_addr(dest); + eth_zero_addr(next_dest); + continue; + } + + next_neigh = batadv_orig_to_router(bat_priv, next_dest, NULL); + if (!next_neigh) { + eth_zero_addr(next_dest); + continue; + } + + if (!batadv_compare_eth(next_neigh->addr, comp_neigh->addr)) { + eth_zero_addr(next_dest); + batadv_neigh_node_put(next_neigh); + continue; + } + + /* found an entry for our next packet to transmit, so remove it + * from the original packet + */ + eth_zero_addr(dest); + batadv_neigh_node_put(next_neigh); + } +} + +/** + * batadv_mcast_forw_shrink_fill() - swap slot with next non-zero destination + * @slot: the to be filled zero-MAC destination entry in a tracker TVLV + * @num_dests_slot: remaining entries in tracker TVLV from/including slot + * + * Searches for the next non-zero-MAC destination entry in a tracker TVLV after + * the given slot pointer. And if found, swaps it with the zero-MAC destination + * entry which the slot points to. + * + * Return: true if slot was swapped/filled successfully, false otherwise. + */ +static bool batadv_mcast_forw_shrink_fill(u8 *slot, u16 num_dests_slot) +{ + u16 num_dests_filler; + u8 *filler; + + /* sanity check, should not happen */ + if (!num_dests_slot) + return false; + + num_dests_filler = num_dests_slot - 1; + filler = slot + ETH_ALEN; + + /* find a candidate to fill the empty slot */ + batadv_mcast_forw_tracker_for_each_dest(filler, num_dests_filler) { + if (is_zero_ether_addr(filler)) + continue; + + ether_addr_copy(slot, filler); + eth_zero_addr(filler); + return true; + } + + return false; +} + +/** + * batadv_mcast_forw_shrink_pack_dests() - pack destinations of a tracker TVLV + * @skb: the batman-adv multicast packet to compact destinations in + * + * Compacts the originator destination MAC addresses in the multicast tracker + * TVLV of the given multicast packet. This is done by moving all non-zero + * MAC addresses in direction of the skb head and all zero MAC addresses in skb + * tail direction, within the multicast tracker TVLV. + * + * Return: The number of consecutive zero MAC address destinations which are + * now at the end of the multicast tracker TVLV. + */ +static int batadv_mcast_forw_shrink_pack_dests(struct sk_buff *skb) +{ + struct batadv_tvlv_mcast_tracker *mcast_tracker; + unsigned char *skb_net_hdr; + u16 num_dests_slot; + u8 *slot; + + skb_net_hdr = skb_network_header(skb); + mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr; + num_dests_slot = ntohs(mcast_tracker->num_dests); + + slot = (u8 *)mcast_tracker + sizeof(*mcast_tracker); + + batadv_mcast_forw_tracker_for_each_dest(slot, num_dests_slot) { + /* find an empty slot */ + if (!is_zero_ether_addr(slot)) + continue; + + if (!batadv_mcast_forw_shrink_fill(slot, num_dests_slot)) + /* could not find a filler, so we successfully packed + * and can stop - and must not reduce num_dests_slot! + */ + break; + } + + /* num_dests_slot is now the amount of reduced, zeroed + * destinations at the end of the tracker TVLV + */ + return num_dests_slot; +} + +/** + * batadv_mcast_forw_shrink_align_offset() - get new alignment offset + * @num_dests_old: the old, to be updated amount of destination nodes + * @num_dests_reduce: the number of destinations that were removed + * + * Calculates the amount of potential extra alignment offset that is needed to + * adjust the TVLV padding after the change in destination nodes. + * + * Return: + * 0: If no change to padding is needed. + * 2: If padding needs to be removed. + * -2: If padding needs to be added. + */ +static short +batadv_mcast_forw_shrink_align_offset(unsigned int num_dests_old, + unsigned int num_dests_reduce) +{ + /* even amount of removed destinations -> no alignment change */ + if (!(num_dests_reduce % 2)) + return 0; + + /* even to odd amount of destinations -> remove padding */ + if (!(num_dests_old % 2)) + return 2; + + /* odd to even amount of destinations -> add padding */ + return -2; +} + +/** + * batadv_mcast_forw_shrink_update_headers() - update shrunk mc packet headers + * @skb: the batman-adv multicast packet to update headers of + * @num_dests_reduce: the number of destinations that were removed + * + * This updates any fields of a batman-adv multicast packet that are affected + * by the reduced number of destinations in the multicast tracket TVLV. In + * particular this updates: + * + * The num_dest field of the multicast tracker TVLV. + * The TVLV length field of the according generic TVLV header. + * The batman-adv multicast packet's total TVLV length field. + * + * Return: The offset in skb's tail direction at which the new batman-adv + * multicast packet header needs to start. + */ +static unsigned int +batadv_mcast_forw_shrink_update_headers(struct sk_buff *skb, + unsigned int num_dests_reduce) +{ + struct batadv_tvlv_mcast_tracker *mcast_tracker; + struct batadv_mcast_packet *mcast_packet; + struct batadv_tvlv_hdr *tvlv_hdr; + unsigned char *skb_net_hdr; + unsigned int offset; + short align_offset; + u16 num_dests; + + skb_net_hdr = skb_network_header(skb); + mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr; + num_dests = ntohs(mcast_tracker->num_dests); + + align_offset = batadv_mcast_forw_shrink_align_offset(num_dests, + num_dests_reduce); + offset = ETH_ALEN * num_dests_reduce + align_offset; + num_dests -= num_dests_reduce; + + /* update tracker header */ + mcast_tracker->num_dests = htons(num_dests); + + /* update tracker's tvlv header's length field */ + tvlv_hdr = (struct batadv_tvlv_hdr *)(skb_network_header(skb) - + sizeof(*tvlv_hdr)); + tvlv_hdr->len = htons(ntohs(tvlv_hdr->len) - offset); + + /* update multicast packet header's tvlv length field */ + mcast_packet = (struct batadv_mcast_packet *)skb->data; + mcast_packet->tvlv_len = htons(ntohs(mcast_packet->tvlv_len) - offset); + + return offset; +} + +/** + * batadv_mcast_forw_shrink_move_headers() - move multicast headers by offset + * @skb: the batman-adv multicast packet to move headers for + * @offset: a non-negative offset to move headers by, towards the skb tail + * + * Moves the batman-adv multicast packet header, its multicast tracker TVLV and + * any TVLVs in between by the given offset in direction towards the tail. + */ +static void +batadv_mcast_forw_shrink_move_headers(struct sk_buff *skb, unsigned int offset) +{ + struct batadv_tvlv_mcast_tracker *mcast_tracker; + unsigned char *skb_net_hdr; + unsigned int len; + u16 num_dests; + + skb_net_hdr = skb_network_header(skb); + mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr; + num_dests = ntohs(mcast_tracker->num_dests); + len = skb_network_offset(skb) + sizeof(*mcast_tracker); + len += num_dests * ETH_ALEN; + + batadv_mcast_forw_scrape(skb, len, offset); +} + +/** + * batadv_mcast_forw_shrink_tracker() - remove zero addresses in a tracker tvlv + * @skb: the batman-adv multicast packet to (potentially) shrink + * + * Removes all destinations with a zero MAC addresses (00:00:00:00:00:00) from + * the given batman-adv multicast packet's tracker TVLV and updates headers + * accordingly to maintain a valid batman-adv multicast packet. + */ +static void batadv_mcast_forw_shrink_tracker(struct sk_buff *skb) +{ + unsigned int offset; + u16 dests_reduced; + + dests_reduced = batadv_mcast_forw_shrink_pack_dests(skb); + if (!dests_reduced) + return; + + offset = batadv_mcast_forw_shrink_update_headers(skb, dests_reduced); + batadv_mcast_forw_shrink_move_headers(skb, offset); +} + +/** + * batadv_mcast_forw_packet() - forward a batman-adv multicast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: the received or locally generated batman-adv multicast packet + * @local_xmit: indicates that the packet was locally generated and not received + * + * Parses the tracker TVLV of a batman-adv multicast packet and forwards the + * packet as indicated in this TVLV. + * + * Caller needs to set the skb network header to the start of the multicast + * tracker TVLV (excluding the generic TVLV header) and the skb transport header + * to the next byte after this multicast tracker TVLV. + * + * Caller needs to free the skb. + * + * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error + * code on failure. NET_RX_SUCCESS if the received packet is supposed to be + * decapsulated and forwarded to the own soft interface, NET_RX_DROP otherwise. + */ +static int batadv_mcast_forw_packet(struct batadv_priv *bat_priv, + struct sk_buff *skb, bool local_xmit) +{ + struct batadv_tvlv_mcast_tracker *mcast_tracker; + struct batadv_neigh_node *neigh_node; + unsigned long offset, num_dests_off; + struct sk_buff *nexthop_skb; + unsigned char *skb_net_hdr; + bool local_recv = false; + unsigned int tvlv_len; + bool xmitted = false; + u8 *dest, *next_dest; + u16 num_dests; + int ret; + + /* (at least) TVLV part needs to be linearized */ + SKB_LINEAR_ASSERT(skb); + + /* check if num_dests is within skb length */ + num_dests_off = offsetof(struct batadv_tvlv_mcast_tracker, num_dests); + if (num_dests_off > skb_network_header_len(skb)) + return -EINVAL; + + skb_net_hdr = skb_network_header(skb); + mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr; + num_dests = ntohs(mcast_tracker->num_dests); + + dest = (u8 *)mcast_tracker + sizeof(*mcast_tracker); + + /* check if full tracker tvlv is within skb length */ + tvlv_len = sizeof(*mcast_tracker) + ETH_ALEN * num_dests; + if (tvlv_len > skb_network_header_len(skb)) + return -EINVAL; + + /* invalidate checksum: */ + skb->ip_summed = CHECKSUM_NONE; + + batadv_mcast_forw_tracker_for_each_dest(dest, num_dests) { + if (is_zero_ether_addr(dest)) + continue; + + /* only unicast originator addresses supported */ + if (is_multicast_ether_addr(dest)) { + eth_zero_addr(dest); + continue; + } + + if (batadv_is_my_mac(bat_priv, dest)) { + eth_zero_addr(dest); + local_recv = true; + continue; + } + + neigh_node = batadv_orig_to_router(bat_priv, dest, NULL); + if (!neigh_node) { + eth_zero_addr(dest); + continue; + } + + nexthop_skb = skb_copy(skb, GFP_ATOMIC); + if (!nexthop_skb) { + batadv_neigh_node_put(neigh_node); + return -ENOMEM; + } + + offset = dest - skb->data; + next_dest = nexthop_skb->data + offset; + + batadv_mcast_forw_scrub_dests(bat_priv, neigh_node, dest, + next_dest, num_dests); + batadv_mcast_forw_shrink_tracker(nexthop_skb); + + batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_TX); + batadv_add_counter(bat_priv, BATADV_CNT_MCAST_TX_BYTES, + nexthop_skb->len + ETH_HLEN); + xmitted = true; + ret = batadv_send_unicast_skb(nexthop_skb, neigh_node); + + batadv_neigh_node_put(neigh_node); + + if (ret < 0) + return ret; + } + + if (xmitted) { + if (local_xmit) { + batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_TX_LOCAL); + batadv_add_counter(bat_priv, + BATADV_CNT_MCAST_TX_LOCAL_BYTES, + skb->len - + skb_transport_offset(skb)); + } else { + batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_FWD); + batadv_add_counter(bat_priv, BATADV_CNT_MCAST_FWD_BYTES, + skb->len + ETH_HLEN); + } + } + + if (local_recv) + return NET_RX_SUCCESS; + else + return NET_RX_DROP; +} + +/** + * batadv_mcast_forw_tracker_tvlv_handler() - handle an mcast tracker tvlv + * @bat_priv: the bat priv with all the soft interface information + * @skb: the received batman-adv multicast packet + * + * Parses the tracker TVLV of an incoming batman-adv multicast packet and + * forwards the packet as indicated in this TVLV. + * + * Caller needs to set the skb network header to the start of the multicast + * tracker TVLV (excluding the generic TVLV header) and the skb transport header + * to the next byte after this multicast tracker TVLV. + * + * Caller needs to free the skb. + * + * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error + * code on failure. NET_RX_SUCCESS if the received packet is supposed to be + * decapsulated and forwarded to the own soft interface, NET_RX_DROP otherwise. + */ +int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + return batadv_mcast_forw_packet(bat_priv, skb, false); +} + +/** + * batadv_mcast_forw_packet_hdrlen() - multicast packet header length + * @num_dests: number of destination nodes + * + * Calculates the total batman-adv multicast packet header length for a given + * number of destination nodes (excluding the outer ethernet frame). + * + * Return: The calculated total batman-adv multicast packet header length. + */ +unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests) +{ + /** + * If the number of destination entries is even then we need to add + * two byte padding to the tracker TVLV. + */ + int padding = (!(num_dests % 2)) ? 2 : 0; + + return padding + num_dests * ETH_ALEN + + sizeof(struct batadv_tvlv_mcast_tracker) + + sizeof(struct batadv_tvlv_hdr) + + sizeof(struct batadv_mcast_packet); +} + +/** + * batadv_mcast_forw_expand_head() - expand headroom for an mcast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to send + * + * Tries to expand an skb's headroom so that its head to tail is 1298 + * bytes (minimum IPv6 MTU + vlan ethernet header size) large. + * + * Return: -EINVAL if the given skb's length is too large or -ENOMEM on memory + * allocation failure. Otherwise, on success, zero is returned. + */ +static int batadv_mcast_forw_expand_head(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + int hdr_size = VLAN_ETH_HLEN + IPV6_MIN_MTU - skb->len; + + /* TODO: Could be tightened to actual number of destination nodes? + * But it's tricky, number of destinations might have increased since + * we last checked. + */ + if (hdr_size < 0) { + /* batadv_mcast_forw_mode_check_count() should ensure we do not + * end up here + */ + WARN_ON(1); + return -EINVAL; + } + + if (skb_headroom(skb) < hdr_size && + pskb_expand_head(skb, hdr_size, 0, GFP_ATOMIC) < 0) + return -ENOMEM; + + return 0; +} + +/** + * batadv_mcast_forw_push() - encapsulate skb in a batman-adv multicast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to encapsulate and send + * @vid: the vlan identifier + * @is_routable: indicates whether the destination is routable + * @count: the number of originators the multicast packet needs to be sent to + * + * Encapsulates the given multicast packet in a batman-adv multicast packet. + * A multicast tracker TVLV with destination originator addresses for any node + * that signaled interest in it, that is either via the translation table or the + * according want-all flags, is attached accordingly. + * + * Return: true on success, false otherwise. + */ +bool batadv_mcast_forw_push(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, int is_routable, int count) +{ + unsigned short tvlv_len = 0; + int ret; + + if (batadv_mcast_forw_expand_head(bat_priv, skb) < 0) + goto err; + + skb_reset_transport_header(skb); + + ret = batadv_mcast_forw_push_tvlvs(bat_priv, skb, vid, is_routable, + count, &tvlv_len); + if (ret < 0) + goto err; + + ret = batadv_mcast_forw_push_hdr(skb, tvlv_len); + if (ret < 0) + goto err; + + return true; + +err: + if (tvlv_len) + skb_pull(skb, tvlv_len); + + return false; +} + +/** + * batadv_mcast_forw_mcsend() - send a self prepared batman-adv multicast packet + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to encapsulate and send + * + * Transmits a batman-adv multicast packet that was locally prepared and + * consumes/frees it. + * + * Return: NET_XMIT_DROP on memory allocation failure. NET_XMIT_SUCCESS + * otherwise. + */ +int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv, + struct sk_buff *skb) +{ + int ret = batadv_mcast_forw_packet(bat_priv, skb, true); + + if (ret < 0) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + + consume_skb(skb); + return NET_XMIT_SUCCESS; +} diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0c64d81a77..1f7ed9d4f6 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -7,6 +7,7 @@ #include "netlink.h" #include "main.h" +#include <linux/array_size.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> @@ -20,7 +21,6 @@ #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> -#include <linux/kernel.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 34903df4fe..71c143d4b6 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -312,6 +312,33 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, } /** + * batadv_orig_to_router() - get next hop neighbor to an orig address + * @bat_priv: the bat priv with all the soft interface information + * @orig_addr: the originator MAC address to search the best next hop router for + * @if_outgoing: the interface where the payload packet has been received or + * the OGM should be sent to + * + * Return: A neighbor node which is the best router towards the given originator + * address. + */ +struct batadv_neigh_node * +batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; + + orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + if (!orig_node) + return NULL; + + neigh_node = batadv_find_router(bat_priv, orig_node, if_outgoing); + batadv_orig_node_put(orig_node); + + return neigh_node; +} + +/** * batadv_orig_ifinfo_get() - find the ifinfo from an orig_node * @orig_node: the orig node to be queried * @if_outgoing: the interface for which the ifinfo should be acquired @@ -942,6 +969,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, #ifdef CONFIG_BATMAN_ADV_MCAST orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4; orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; + orig_node->mcast_flags |= BATADV_MCAST_HAVE_MC_PTYPE_CAPA; INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index ea3d69e4e6..db0c551281 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -36,6 +36,9 @@ void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); +struct batadv_neigh_node * +batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, + struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 163cd43c48..f106198514 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1270,3 +1270,73 @@ out: batadv_orig_node_put(orig_node); return ret; } + +#ifdef CONFIG_BATMAN_ADV_MCAST +/** + * batadv_recv_mcast_packet() - process received batman-adv multicast packet + * @skb: the received batman-adv multicast packet + * @recv_if: interface that the skb is received on + * + * Parses the given, received batman-adv multicast packet. Depending on the + * contents of its TVLV forwards it and/or decapsulates it to hand it to the + * soft interface. + * + * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. + */ +int batadv_recv_mcast_packet(struct sk_buff *skb, + struct batadv_hard_iface *recv_if) +{ + struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_mcast_packet *mcast_packet; + int hdr_size = sizeof(*mcast_packet); + unsigned char *tvlv_buff; + int ret = NET_RX_DROP; + u16 tvlv_buff_len; + + if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0) + goto free_skb; + + /* create a copy of the skb, if needed, to modify it. */ + if (skb_cow(skb, ETH_HLEN) < 0) + goto free_skb; + + /* packet needs to be linearized to access the tvlv content */ + if (skb_linearize(skb) < 0) + goto free_skb; + + mcast_packet = (struct batadv_mcast_packet *)skb->data; + if (mcast_packet->ttl-- < 2) + goto free_skb; + + tvlv_buff = (unsigned char *)(skb->data + hdr_size); + tvlv_buff_len = ntohs(mcast_packet->tvlv_len); + + if (tvlv_buff_len > skb->len - hdr_size) + goto free_skb; + + ret = batadv_tvlv_containers_process(bat_priv, BATADV_MCAST, NULL, skb, + tvlv_buff, tvlv_buff_len); + if (ret >= 0) { + batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_RX); + batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_BYTES, + skb->len + ETH_HLEN); + } + + hdr_size += tvlv_buff_len; + + if (ret == NET_RX_SUCCESS && (skb->len - hdr_size >= ETH_HLEN)) { + batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL); + batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL_BYTES, + skb->len - hdr_size); + + batadv_interface_rx(bat_priv->soft_iface, skb, hdr_size, NULL); + /* skb was consumed */ + skb = NULL; + } + +free_skb: + kfree_skb(skb); + + return ret; +} +#endif /* CONFIG_BATMAN_ADV_MCAST */ diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index afd15b3879..e9849f032a 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -27,6 +27,17 @@ int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *iface); int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); +#ifdef CONFIG_BATMAN_ADV_MCAST +int batadv_recv_mcast_packet(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); +#else +static inline int batadv_recv_mcast_packet(struct sk_buff *skb, + struct batadv_hard_iface *recv_if) +{ + kfree_skb(skb); + return NET_RX_DROP; +} +#endif int batadv_recv_unicast_tvlv(struct sk_buff *skb, struct batadv_hard_iface *recv_if); int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 1bf1232a4f..89c51b3cf4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -301,12 +301,13 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { - forw_mode = batadv_mcast_forw_mode(bat_priv, skb, + forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid, &mcast_is_routable); switch (forw_mode) { case BATADV_FORW_BCAST: break; case BATADV_FORW_UCASTS: + case BATADV_FORW_MCAST: do_bcast = false; break; case BATADV_FORW_NONE: @@ -365,6 +366,8 @@ send: } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); + } else if (forw_mode == BATADV_FORW_MCAST) { + ret = batadv_mcast_forw_mcsend(bat_priv, skb); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) @@ -762,6 +765,7 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); + atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); @@ -925,6 +929,18 @@ static const struct { { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, +#ifdef CONFIG_BATMAN_ADV_MCAST + { "mcast_tx" }, + { "mcast_tx_bytes" }, + { "mcast_tx_local" }, + { "mcast_tx_local_bytes" }, + { "mcast_rx" }, + { "mcast_rx_bytes" }, + { "mcast_rx_local" }, + { "mcast_rx_local_bytes" }, + { "mcast_fwd" }, + { "mcast_fwd_bytes" }, +#endif #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b95c36765d..2243cec18e 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3948,7 +3948,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) spin_lock_bh(&bat_priv->tt.commit_lock); - while (true) { + while (timeout) { table_size = batadv_tt_local_table_transmit_size(bat_priv); if (packet_size_max >= table_size) break; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 17d5ea1d8e..00840d5784 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -862,6 +862,70 @@ enum batadv_counters { */ BATADV_CNT_TT_ROAM_ADV_RX, +#ifdef CONFIG_BATMAN_ADV_MCAST + /** + * @BATADV_CNT_MCAST_TX: transmitted batman-adv multicast packets + * counter + */ + BATADV_CNT_MCAST_TX, + + /** + * @BATADV_CNT_MCAST_TX_BYTES: transmitted batman-adv multicast packets + * bytes counter + */ + BATADV_CNT_MCAST_TX_BYTES, + + /** + * @BATADV_CNT_MCAST_TX_LOCAL: counter for multicast packets which + * were locally encapsulated and transmitted as batman-adv multicast + * packets + */ + BATADV_CNT_MCAST_TX_LOCAL, + + /** + * @BATADV_CNT_MCAST_TX_LOCAL_BYTES: bytes counter for multicast packets + * which were locally encapsulated and transmitted as batman-adv + * multicast packets + */ + BATADV_CNT_MCAST_TX_LOCAL_BYTES, + + /** + * @BATADV_CNT_MCAST_RX: received batman-adv multicast packet counter + */ + BATADV_CNT_MCAST_RX, + + /** + * @BATADV_CNT_MCAST_RX_BYTES: received batman-adv multicast packet + * bytes counter + */ + BATADV_CNT_MCAST_RX_BYTES, + + /** + * @BATADV_CNT_MCAST_RX_LOCAL: counter for received batman-adv multicast + * packets which were forwarded to the local soft interface + */ + BATADV_CNT_MCAST_RX_LOCAL, + + /** + * @BATADV_CNT_MCAST_RX_LOCAL_BYTES: bytes counter for received + * batman-adv multicast packets which were forwarded to the local soft + * interface + */ + BATADV_CNT_MCAST_RX_LOCAL_BYTES, + + /** + * @BATADV_CNT_MCAST_FWD: counter for received batman-adv multicast + * packets which were forwarded to other, neighboring nodes + */ + BATADV_CNT_MCAST_FWD, + + /** + * @BATADV_CNT_MCAST_FWD_BYTES: bytes counter for received batman-adv + * multicast packets which were forwarded to other, neighboring nodes + */ + BATADV_CNT_MCAST_FWD_BYTES, +#endif + #ifdef CONFIG_BATMAN_ADV_DAT /** * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter @@ -1279,6 +1343,12 @@ struct batadv_priv_mcast { atomic_t num_want_all_rtr6; /** + * @num_no_mc_ptype_capa: counter for number of nodes without the + * BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag + */ + atomic_t num_no_mc_ptype_capa; + + /** * @want_lists_lock: lock for protecting modifications to mcasts * want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked) */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4eb1b3ced0..715cbafbf6 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -892,7 +892,7 @@ static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type) chan->ops = &bt_6lowpan_chan_ops; err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0, - addr, dst_type); + addr, dst_type, L2CAP_CONN_TIMEOUT); BT_DBG("chan %p err %d", chan, err); if (err < 0) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 50c55d7335..18f97b2288 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -68,7 +68,7 @@ static const struct sco_param esco_param_msbc[] = { }; /* This function requires the caller holds hdev->lock */ -static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) +void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; @@ -178,64 +178,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_dev_put(hdev); } -static void hci_acl_create_connection(struct hci_conn *conn) -{ - struct hci_dev *hdev = conn->hdev; - struct inquiry_entry *ie; - struct hci_cp_create_conn cp; - - BT_DBG("hcon %p", conn); - - /* Many controllers disallow HCI Create Connection while it is doing - * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create - * Connection. This may cause the MGMT discovering state to become false - * without user space's request but it is okay since the MGMT Discovery - * APIs do not promise that discovery should be done forever. Instead, - * the user space monitors the status of MGMT discovering and it may - * request for discovery again when this flag becomes false. - */ - if (test_bit(HCI_INQUIRY, &hdev->flags)) { - /* Put this connection to "pending" state so that it will be - * executed after the inquiry cancel command complete event. - */ - conn->state = BT_CONNECT2; - hci_send_cmd(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL); - return; - } - - conn->state = BT_CONNECT; - conn->out = true; - conn->role = HCI_ROLE_MASTER; - - conn->attempt++; - - conn->link_policy = hdev->link_policy; - - memset(&cp, 0, sizeof(cp)); - bacpy(&cp.bdaddr, &conn->dst); - cp.pscan_rep_mode = 0x02; - - ie = hci_inquiry_cache_lookup(hdev, &conn->dst); - if (ie) { - if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { - cp.pscan_rep_mode = ie->data.pscan_rep_mode; - cp.pscan_mode = ie->data.pscan_mode; - cp.clock_offset = ie->data.clock_offset | - cpu_to_le16(0x8000); - } - - memcpy(conn->dev_class, ie->data.dev_class, 3); - } - - cp.pkt_type = cpu_to_le16(conn->pkt_type); - if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) - cp.role_switch = 0x01; - else - cp.role_switch = 0x00; - - hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp); -} - int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); @@ -299,6 +241,13 @@ static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec) __u8 vnd_len, *vnd_data = NULL; struct hci_op_configure_data_path *cmd = NULL; + if (!codec->data_path || !hdev->get_codec_config_data) + return 0; + + /* Do not take me as error */ + if (!hdev->get_codec_config_data) + return 0; + err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, &vnd_data); if (err < 0) @@ -344,9 +293,7 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) bt_dev_dbg(hdev, "hcon %p", conn); - /* for offload use case, codec needs to configured before opening SCO */ - if (conn->codec.data_path) - configure_datapath_sync(hdev, &conn->codec); + configure_datapath_sync(hdev, &conn->codec); conn->state = BT_CONNECT; conn->out = true; @@ -1085,8 +1032,9 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) hci_conn_failed(conn, reason); break; case ISO_LINK: - if (conn->state != BT_CONNECTED && - !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) + if ((conn->state != BT_CONNECTED && + !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || + test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) hci_conn_failed(conn, reason); break; } @@ -1176,6 +1124,9 @@ void hci_conn_del(struct hci_conn *conn) * rest of hci_conn_del. */ hci_conn_cleanup(conn); + + /* Dequeue callbacks using connection pointer as data */ + hci_cmd_sync_dequeue(hdev, NULL, conn, NULL); } struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) @@ -1310,56 +1261,9 @@ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) return 0; } -static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) -{ - struct hci_conn *conn; - u16 handle = PTR_UINT(data); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return; - - bt_dev_dbg(hdev, "err %d", err); - - hci_dev_lock(hdev); - - if (!err) { - hci_connect_le_scan_cleanup(conn, 0x00); - goto done; - } - - /* Check if connection is still pending */ - if (conn != hci_lookup_le_connect(hdev)) - goto done; - - /* Flush to make sure we send create conn cancel command if needed */ - flush_delayed_work(&conn->le_conn_timeout); - hci_conn_failed(conn, bt_status(err)); - -done: - hci_dev_unlock(hdev); -} - -static int hci_connect_le_sync(struct hci_dev *hdev, void *data) -{ - struct hci_conn *conn; - u16 handle = PTR_UINT(data); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; - - bt_dev_dbg(hdev, "conn %p", conn); - - clear_bit(HCI_CONN_SCANNING, &conn->flags); - conn->state = BT_CONNECT; - - return hci_le_create_conn_sync(hdev, conn); -} - struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, - u16 conn_timeout, u8 role) + u16 conn_timeout, u8 role, u8 phy, u8 sec_phy) { struct hci_conn *conn; struct smp_irk *irk; @@ -1422,10 +1326,10 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; + conn->le_adv_phy = phy; + conn->le_adv_sec_phy = sec_phy; - err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, - UINT_PTR(conn->handle), - create_le_conn_complete); + err = hci_connect_le_sync(hdev, conn); if (err) { hci_conn_del(conn); return ERR_PTR(err); @@ -1659,7 +1563,7 @@ done: struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, - enum conn_reasons conn_reason) + enum conn_reasons conn_reason, u16 timeout) { struct hci_conn *acl; @@ -1690,10 +1594,18 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl->conn_reason = conn_reason; if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { + int err; + acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; acl->auth_type = auth_type; - hci_acl_create_connection(acl); + acl->conn_timeout = timeout; + + err = hci_connect_acl_sync(hdev, acl); + if (err) { + hci_conn_del(acl); + return ERR_PTR(err); + } } return acl; @@ -1728,14 +1640,15 @@ static struct hci_link *hci_conn_link(struct hci_conn *parent, } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting, struct bt_codec *codec) + __u16 setting, struct bt_codec *codec, + u16 timeout) { struct hci_conn *acl; struct hci_conn *sco; struct hci_link *link; acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING, - CONN_REASON_SCO_CONNECT); + CONN_REASON_SCO_CONNECT, timeout); if (IS_ERR(acl)) return acl; @@ -2224,7 +2137,17 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 base_len, __u8 *base) { struct hci_conn *conn; + struct hci_conn *parent; __u8 eir[HCI_MAX_PER_AD_LENGTH]; + struct hci_link *link; + + /* Look for any BIS that is open for rebinding */ + conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN); + if (conn) { + memcpy(qos, &conn->iso_qos, sizeof(*qos)); + conn->state = BT_CONNECTED; + return conn; + } if (base_len && base) base_len = eir_append_service_data(eir, 0, 0x1851, @@ -2252,6 +2175,20 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, conn->iso_qos = *qos; conn->state = BT_BOUND; + /* Link BISes together */ + parent = hci_conn_hash_lookup_big(hdev, + conn->iso_qos.bcast.big); + if (parent && parent != conn) { + link = hci_conn_link(parent, conn); + if (!link) { + hci_conn_drop(conn); + return ERR_PTR(-ENOLINK); + } + + /* Link takes the refcount */ + hci_conn_drop(conn); + } + return conn; } @@ -2283,6 +2220,9 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, if (IS_ERR(conn)) return conn; + if (conn->state == BT_CONNECTED) + return conn; + data.big = qos->bcast.big; data.bis = qos->bcast.bis; @@ -2315,7 +2255,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, le = hci_connect_le(hdev, dst, dst_type, false, BT_SECURITY_LOW, HCI_LE_CONN_TIMEOUT, - HCI_ROLE_SLAVE); + HCI_ROLE_SLAVE, 0, 0); else le = hci_connect_le_scan(hdev, dst, dst_type, BT_SECURITY_LOW, @@ -2610,22 +2550,6 @@ void hci_conn_hash_flush(struct hci_dev *hdev) } } -/* Check pending connect attempts */ -void hci_conn_check_pending(struct hci_dev *hdev) -{ - struct hci_conn *conn; - - BT_DBG("hdev %s", hdev->name); - - hci_dev_lock(hdev); - - conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2); - if (conn) - hci_acl_create_connection(conn); - - hci_dev_unlock(hdev); -} - static u32 get_link_mode(struct hci_conn *conn) { u32 link_mode = 0; @@ -2941,12 +2865,10 @@ u32 hci_conn_get_phy(struct hci_conn *conn) static int abort_conn_sync(struct hci_dev *hdev, void *data) { - struct hci_conn *conn; - u16 handle = PTR_UINT(data); + struct hci_conn *conn = data; - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; return hci_abort_conn_sync(hdev, conn, conn->abort_reason); } @@ -2974,14 +2896,17 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) */ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { switch (hci_skb_event(hdev->sent_cmd)) { + case HCI_EV_CONN_COMPLETE: case HCI_EV_LE_CONN_COMPLETE: case HCI_EV_LE_ENHANCED_CONN_COMPLETE: case HCI_EVT_LE_CIS_ESTABLISHED: hci_cmd_sync_cancel(hdev, ECANCELED); break; } + /* Cancel connect attempt if still queued/pending */ + } else if (!hci_cancel_connect_sync(hdev, conn)) { + return 0; } - return hci_cmd_sync_queue(hdev, abort_conn_sync, UINT_PTR(conn->handle), - NULL); + return hci_cmd_sync_queue_once(hdev, abort_conn_sync, conn, NULL); } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 233453807b..ce3ff2fa72 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -218,10 +218,12 @@ static int conn_info_min_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val > hdev->conn_info_max_age) + hci_dev_lock(hdev); + if (val == 0 || val > hdev->conn_info_max_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_min_age = val; hci_dev_unlock(hdev); @@ -246,10 +248,12 @@ static int conn_info_max_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val < hdev->conn_info_min_age) + hci_dev_lock(hdev); + if (val == 0 || val < hdev->conn_info_min_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_max_age = val; hci_dev_unlock(hdev); @@ -567,10 +571,12 @@ static int sniff_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val > hdev->sniff_max_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val > hdev->sniff_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_min_interval = val; hci_dev_unlock(hdev); @@ -595,10 +601,12 @@ static int sniff_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val < hdev->sniff_min_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val < hdev->sniff_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_max_interval = val; hci_dev_unlock(hdev); @@ -850,10 +858,12 @@ static int conn_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_min_interval = val; hci_dev_unlock(hdev); @@ -878,10 +888,12 @@ static int conn_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_max_interval = val; hci_dev_unlock(hdev); @@ -990,10 +1002,12 @@ static int adv_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_min_interval = val; hci_dev_unlock(hdev); @@ -1018,10 +1032,12 @@ static int adv_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_max_interval = val; hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6275b14b56..9d1063c51e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -93,11 +93,11 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data, /* It is possible that we receive Inquiry Complete event right * before we receive Inquiry Cancel Command Complete event, in * which case the latter event should have status of Command - * Disallowed (0x0c). This should not be treated as error, since + * Disallowed. This should not be treated as error, since * we actually achieve what Inquiry Cancel wants to achieve, * which is to end the last Inquiry session. */ - if (rp->status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) { bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); rp->status = 0x00; } @@ -118,8 +118,6 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data, hci_discovery_set_state(hdev, DISCOVERY_STOPPED); hci_dev_unlock(hdev); - hci_conn_check_pending(hdev); - return rp->status; } @@ -150,8 +148,6 @@ static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data, hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); - hci_conn_check_pending(hdev); - return rp->status; } @@ -2312,10 +2308,8 @@ static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status) { bt_dev_dbg(hdev, "status 0x%2.2x", status); - if (status) { - hci_conn_check_pending(hdev); + if (status) return; - } if (hci_sent_cmd_data(hdev, HCI_OP_INQUIRY)) set_bit(HCI_INQUIRY, &hdev->flags); @@ -2340,12 +2334,9 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) if (status) { if (conn && conn->state == BT_CONNECT) { - if (status != 0x0c || conn->attempt > 2) { - conn->state = BT_CLOSED; - hci_connect_cfm(conn, status); - hci_conn_del(conn); - } else - conn->state = BT_CONNECT2; + conn->state = BT_CLOSED; + hci_connect_cfm(conn, status); + hci_conn_del(conn); } } else { if (!conn) { @@ -3035,8 +3026,6 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - hci_conn_check_pending(hdev); - if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; @@ -3219,6 +3208,31 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, if (test_bit(HCI_ENCRYPT, &hdev->flags)) set_bit(HCI_CONN_ENCRYPT, &conn->flags); + /* "Link key request" completed ahead of "connect request" completes */ + if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + ev->link_type == ACL_LINK) { + struct link_key *key; + struct hci_cp_read_enc_key_size cp; + + key = hci_find_link_key(hdev, &ev->bdaddr); + if (key) { + set_bit(HCI_CONN_ENCRYPT, &conn->flags); + + if (!read_key_size_capable(hdev)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + cp.handle = cpu_to_le16(conn->handle); + if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "sending read key size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } + } + + hci_encrypt_cfm(conn, ev->status); + } + } + /* Get remote features */ if (conn->type == ACL_LINK) { struct hci_cp_read_remote_features cp; @@ -3258,8 +3272,6 @@ done: unlock: hci_dev_unlock(hdev); - - hci_conn_check_pending(hdev); } static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr) @@ -3654,7 +3666,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, * controller really supports it. If it doesn't, assume * the default size (16). */ - if (!(hdev->commands[20] & 0x10)) { + if (!read_key_size_capable(hdev)) { conn->enc_key_size = HCI_LINK_KEY_SIZE; goto notify; } @@ -6169,7 +6181,7 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data, static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, bool addr_resolved, - u8 adv_type) + u8 adv_type, u8 phy, u8 sec_phy) { struct hci_conn *conn; struct hci_conn_params *params; @@ -6224,7 +6236,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, conn = hci_connect_le(hdev, addr, addr_type, addr_resolved, BT_SECURITY_LOW, hdev->def_le_autoconnect_timeout, - HCI_ROLE_MASTER); + HCI_ROLE_MASTER, phy, sec_phy); if (!IS_ERR(conn)) { /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned * by higher layer that tried to connect, if no then @@ -6259,8 +6271,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, u8 bdaddr_type, bdaddr_t *direct_addr, - u8 direct_addr_type, s8 rssi, u8 *data, u8 len, - bool ext_adv, bool ctl_time, u64 instant) + u8 direct_addr_type, u8 phy, u8 sec_phy, s8 rssi, + u8 *data, u8 len, bool ext_adv, bool ctl_time, + u64 instant) { struct discovery_state *d = &hdev->discovery; struct smp_irk *irk; @@ -6348,7 +6361,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * for advertising reports) and is already verified to be RPA above. */ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved, - type); + type, phy, sec_phy); if (!ext_adv && conn && type == LE_ADV_IND && len <= max_adv_len(hdev)) { /* Store report for later inclusion by @@ -6494,7 +6507,8 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, if (info->length <= max_adv_len(hdev)) { rssi = info->data[info->length]; process_adv_report(hdev, info->type, &info->bdaddr, - info->bdaddr_type, NULL, 0, rssi, + info->bdaddr_type, NULL, 0, + HCI_ADV_PHY_1M, 0, rssi, info->data, info->length, false, false, instant); } else { @@ -6579,6 +6593,8 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, info->bdaddr_type, NULL, 0, + info->primary_phy, + info->secondary_phy, info->rssi, info->data, info->length, !(evt_type & LE_EXT_ADV_LEGACY_PDU), false, instant); @@ -6684,7 +6700,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, * transition into connected state and mark it as * successful. */ - if (!conn->out && ev->status == 0x1a && + if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE && (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) status = 0x00; else @@ -6861,8 +6877,8 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data, process_adv_report(hdev, info->type, &info->bdaddr, info->bdaddr_type, &info->direct_addr, - info->direct_addr_type, info->rssi, NULL, 0, - false, false, instant); + info->direct_addr_type, HCI_ADV_PHY_1M, 0, + info->rssi, NULL, 0, false, false, instant); } hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 00e0213800..efea25eb56 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -105,8 +105,10 @@ void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; - if (skb) + if (skb) { + kfree_skb(hdev->req_skb); hdev->req_skb = skb_get(skb); + } wake_up_interruptible(&hdev->req_wait_q); } } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 3e7cd330d7..3f5f093233 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1946,10 +1946,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, switch (optname) { case HCI_DATA_DIR: - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; @@ -1958,10 +1957,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, break; case HCI_TIME_STAMP: - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } if (opt) hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; @@ -1979,11 +1977,9 @@ static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, uf.event_mask[1] = *((u32 *) f->event_mask + 1); } - len = min_t(unsigned int, len, sizeof(uf)); - if (copy_from_sockptr(&uf, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&uf, sizeof(uf), optval, len); + if (err) break; - } if (!capable(CAP_NET_RAW)) { uf.type_mask &= hci_sec_filter.type_mask; @@ -2042,10 +2038,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, goto done; } - if (copy_from_sockptr(&opt, optval, sizeof(opt))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, len); + if (err) break; - } hci_pi(sk)->mtu = opt; break; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5ce71c483b..40b71bc505 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -352,8 +352,6 @@ static void le_scan_disable(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) goto _return; - cancel_delayed_work(&hdev->le_scan_restart); - status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "failed to disable LE scan: %d", status); @@ -401,71 +399,6 @@ _return: static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup); -static int hci_le_scan_restart_sync(struct hci_dev *hdev) -{ - /* If controller is not scanning we are done. */ - if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) - return 0; - - if (hdev->scanning_paused) { - bt_dev_dbg(hdev, "Scanning is paused for suspend"); - return 0; - } - - hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00); - return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE, - LE_SCAN_FILTER_DUP_ENABLE); -} - -static void le_scan_restart(struct work_struct *work) -{ - struct hci_dev *hdev = container_of(work, struct hci_dev, - le_scan_restart.work); - unsigned long timeout, duration, scan_start, now; - int status; - - bt_dev_dbg(hdev, ""); - - status = hci_le_scan_restart_sync(hdev); - if (status) { - bt_dev_err(hdev, "failed to restart LE scan: status %d", - status); - return; - } - - hci_dev_lock(hdev); - - if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || - !hdev->discovery.scan_start) - goto unlock; - - /* When the scan was started, hdev->le_scan_disable has been queued - * after duration from scan_start. During scan restart this job - * has been canceled, and we need to queue it again after proper - * timeout, to make sure that scan does not run indefinitely. - */ - duration = hdev->discovery.scan_duration; - scan_start = hdev->discovery.scan_start; - now = jiffies; - if (now - scan_start <= duration) { - int elapsed; - - if (now >= scan_start) - elapsed = now - scan_start; - else - elapsed = ULONG_MAX - scan_start + now; - - timeout = duration - elapsed; - } else { - timeout = 0; - } - - queue_delayed_work(hdev->req_workqueue, - &hdev->le_scan_disable, timeout); - -unlock: - hci_dev_unlock(hdev); -} static int reenable_adv_sync(struct hci_dev *hdev, void *data) { @@ -634,10 +567,20 @@ void hci_cmd_sync_init(struct hci_dev *hdev) INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work); INIT_WORK(&hdev->reenable_adv_work, reenable_adv); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable); - INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart); INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); } +static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry, + int err) +{ + if (entry->destroy) + entry->destroy(hdev, entry->data, err); + + list_del(&entry->list); + kfree(entry); +} + void hci_cmd_sync_clear(struct hci_dev *hdev) { struct hci_cmd_sync_work_entry *entry, *tmp; @@ -646,13 +589,8 @@ void hci_cmd_sync_clear(struct hci_dev *hdev) cancel_work_sync(&hdev->reenable_adv_work); mutex_lock(&hdev->cmd_sync_work_lock); - list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { - if (entry->destroy) - entry->destroy(hdev, entry->data, -ECANCELED); - - list_del(&entry->list); - kfree(entry); - } + list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } @@ -744,6 +682,115 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, } EXPORT_SYMBOL(hci_cmd_sync_queue); +static struct hci_cmd_sync_work_entry * +_hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { + if (func && entry->func != func) + continue; + + if (data && entry->data != data) + continue; + + if (destroy && entry->destroy != destroy) + continue; + + return entry; + } + + return NULL; +} + +/* Queue HCI command entry once: + * + * - Lookup if an entry already exist and only if it doesn't creates a new entry + * and queue it. + */ +int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) + return 0; + + return hci_cmd_sync_queue(hdev, func, data, destroy); +} +EXPORT_SYMBOL(hci_cmd_sync_queue_once); + +/* Lookup HCI command entry: + * + * - Return first entry that matches by function callback or data or + * destroy callback. + */ +struct hci_cmd_sync_work_entry * +hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry; + + mutex_lock(&hdev->cmd_sync_work_lock); + entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + mutex_unlock(&hdev->cmd_sync_work_lock); + + return entry; +} +EXPORT_SYMBOL(hci_cmd_sync_lookup_entry); + +/* Cancel HCI command entry */ +void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, + struct hci_cmd_sync_work_entry *entry) +{ + mutex_lock(&hdev->cmd_sync_work_lock); + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + mutex_unlock(&hdev->cmd_sync_work_lock); +} +EXPORT_SYMBOL(hci_cmd_sync_cancel_entry); + +/* Dequeue one HCI command entry: + * + * - Lookup and cancel first entry that matches. + */ +bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, + hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry; + + entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + if (!entry) + return false; + + hci_cmd_sync_cancel_entry(hdev, entry); + + return true; +} +EXPORT_SYMBOL(hci_cmd_sync_dequeue_once); + +/* Dequeue HCI command entry: + * + * - Lookup and cancel any entry that matches by function callback or data or + * destroy callback. + */ +bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + struct hci_cmd_sync_work_entry *entry; + bool ret = false; + + mutex_lock(&hdev->cmd_sync_work_lock); + while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data, + destroy))) { + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + ret = true; + } + mutex_unlock(&hdev->cmd_sync_work_lock); + + return ret; +} +EXPORT_SYMBOL(hci_cmd_sync_dequeue); + int hci_update_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; @@ -2679,6 +2726,14 @@ done: return filter_policy; } +static void hci_le_scan_phy_params(struct hci_cp_le_scan_phy_params *cp, + u8 type, u16 interval, u16 window) +{ + cp->type = type; + cp->interval = cpu_to_le16(interval); + cp->window = cpu_to_le16(window); +} + static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) @@ -2686,7 +2741,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, struct hci_cp_le_set_ext_scan_params *cp; struct hci_cp_le_scan_phy_params *phy; u8 data[sizeof(*cp) + sizeof(*phy) * 2]; - u8 num_phy = 0; + u8 num_phy = 0x00; cp = (void *)data; phy = (void *)cp->data; @@ -2696,28 +2751,64 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, cp->own_addr_type = own_addr_type; cp->filter_policy = filter_policy; + /* Check if PA Sync is in progress then select the PHY based on the + * hci_conn.iso_qos. + */ + if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { + struct hci_cp_le_add_to_accept_list *sent; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST); + if (sent) { + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK, + &sent->bdaddr); + if (conn) { + struct bt_iso_qos *qos = &conn->iso_qos; + + if (qos->bcast.in.phy & BT_ISO_PHY_1M || + qos->bcast.in.phy & BT_ISO_PHY_2M) { + cp->scanning_phys |= LE_SCAN_PHY_1M; + hci_le_scan_phy_params(phy, type, + interval, + window); + num_phy++; + phy++; + } + + if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { + cp->scanning_phys |= LE_SCAN_PHY_CODED; + hci_le_scan_phy_params(phy, type, + interval * 3, + window * 3); + num_phy++; + phy++; + } + + if (num_phy) + goto done; + } + } + } + if (scan_1m(hdev) || scan_2m(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_1M; - - phy->type = type; - phy->interval = cpu_to_le16(interval); - phy->window = cpu_to_le16(window); - + hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (scan_coded(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_CODED; - - phy->type = type; - phy->interval = cpu_to_le16(interval); - phy->window = cpu_to_le16(window); - + hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } +done: + if (!num_phy) + return -EINVAL; + return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS, sizeof(*cp) + sizeof(*phy) * num_phy, data, HCI_CMD_TIMEOUT); @@ -2956,7 +3047,8 @@ int hci_update_passive_scan(struct hci_dev *hdev) hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; - return hci_cmd_sync_queue(hdev, update_passive_scan_sync, NULL, NULL); + return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, + NULL); } int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val) @@ -3292,7 +3384,10 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - bacpy(&hdev->public_addr, &ba); + if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + baswap(&hdev->public_addr, &ba); + else + bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { @@ -4979,7 +5074,6 @@ int hci_dev_close_sync(struct hci_dev *hdev) cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); cancel_delayed_work(&hdev->le_scan_disable); - cancel_delayed_work(&hdev->le_scan_restart); hci_request_cancel_all(hdev); @@ -5203,7 +5297,6 @@ int hci_stop_discovery_sync(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); - cancel_delayed_work(&hdev->le_scan_restart); err = hci_scan_disable_sync(hdev); if (err) @@ -5711,19 +5804,18 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; - if (hci_is_adv_monitoring(hdev)) { + if (hci_is_adv_monitoring(hdev) || + (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && + hdev->discovery.result_filtering)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one * advertisement for one peer(*) during active scanning, and * might report loss to these peers. * - * Note that different controllers have different meanings of - * |duplicate|. Some of them consider packets with the same - * address as duplicate, and others consider packets with the - * same address and the same RSSI as duplicate. Although in the - * latter case we don't need to disable duplicate filter, but - * it is common to have active scanning for a short period of - * time, the power impact should be neglectable. + * If controller does strict duplicate filtering and the + * discovery requires result filtering disables controller based + * filtering since that can cause reports that would match the + * host filter to not be reported. */ filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } @@ -5803,17 +5895,6 @@ int hci_start_discovery_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout)); - /* When service discovery is used and the controller has a - * strict duplicate filter, it is important to remember the - * start and duration of the scan. This is required for - * restarting scanning during the discovery phase. - */ - if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && - hdev->discovery.result_filtering) { - hdev->discovery.scan_start = jiffies; - hdev->discovery.scan_duration = timeout; - } - queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable, timeout); return 0; @@ -6234,7 +6315,8 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, plen = sizeof(*cp); - if (scan_1m(hdev)) { + if (scan_1m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_1M || + conn->le_adv_sec_phy == HCI_ADV_PHY_1M)) { cp->phys |= LE_SCAN_PHY_1M; set_ext_conn_params(conn, p); @@ -6242,7 +6324,8 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, plen += sizeof(*p); } - if (scan_2m(hdev)) { + if (scan_2m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_2M || + conn->le_adv_sec_phy == HCI_ADV_PHY_2M)) { cp->phys |= LE_SCAN_PHY_2M; set_ext_conn_params(conn, p); @@ -6250,7 +6333,8 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, plen += sizeof(*p); } - if (scan_coded(hdev)) { + if (scan_coded(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_CODED || + conn->le_adv_sec_phy == HCI_ADV_PHY_CODED)) { cp->phys |= LE_SCAN_PHY_CODED; set_ext_conn_params(conn, p); @@ -6263,12 +6347,21 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, conn->conn_timeout, NULL); } -int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) +static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_create_conn cp; struct hci_conn_params *params; u8 own_addr_type; int err; + struct hci_conn *conn = data; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + bt_dev_dbg(hdev, "conn %p", conn); + + clear_bit(HCI_CONN_SCANNING, &conn->flags); + conn->state = BT_CONNECT; /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { @@ -6586,3 +6679,125 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance) return hci_cmd_sync_queue(hdev, _update_adv_data_sync, UINT_PTR(instance), NULL); } + +static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) +{ + struct hci_conn *conn = data; + struct inquiry_entry *ie; + struct hci_cp_create_conn cp; + int err; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + /* Many controllers disallow HCI Create Connection while it is doing + * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create + * Connection. This may cause the MGMT discovering state to become false + * without user space's request but it is okay since the MGMT Discovery + * APIs do not promise that discovery should be done forever. Instead, + * the user space monitors the status of MGMT discovering and it may + * request for discovery again when this flag becomes false. + */ + if (test_bit(HCI_INQUIRY, &hdev->flags)) { + err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, + NULL, HCI_CMD_TIMEOUT); + if (err) + bt_dev_warn(hdev, "Failed to cancel inquiry %d", err); + } + + conn->state = BT_CONNECT; + conn->out = true; + conn->role = HCI_ROLE_MASTER; + + conn->attempt++; + + conn->link_policy = hdev->link_policy; + + memset(&cp, 0, sizeof(cp)); + bacpy(&cp.bdaddr, &conn->dst); + cp.pscan_rep_mode = 0x02; + + ie = hci_inquiry_cache_lookup(hdev, &conn->dst); + if (ie) { + if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { + cp.pscan_rep_mode = ie->data.pscan_rep_mode; + cp.pscan_mode = ie->data.pscan_mode; + cp.clock_offset = ie->data.clock_offset | + cpu_to_le16(0x8000); + } + + memcpy(conn->dev_class, ie->data.dev_class, 3); + } + + cp.pkt_type = cpu_to_le16(conn->pkt_type); + if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) + cp.role_switch = 0x01; + else + cp.role_switch = 0x00; + + return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, + sizeof(cp), &cp, + HCI_EV_CONN_COMPLETE, + conn->conn_timeout, NULL); +} + +int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, + NULL); +} + +static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, conn)) + goto done; + + if (!err) { + hci_connect_le_scan_cleanup(conn, 0x00); + goto done; + } + + /* Check if connection is still pending */ + if (conn != hci_lookup_le_connect(hdev)) + goto done; + + /* Flush to make sure we send create conn cancel command if needed */ + flush_delayed_work(&conn->le_conn_timeout); + hci_conn_failed(conn, bt_status(err)); + +done: + hci_dev_unlock(hdev); +} + +int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, + create_le_conn_complete); +} + +int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + if (conn->state != BT_OPEN) + return -EINVAL; + + switch (conn->type) { + case ACL_LINK: + return !hci_cmd_sync_dequeue_once(hdev, + hci_acl_create_conn_sync, + conn, NULL); + case LE_LINK: + return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync, + conn, create_le_conn_complete); + } + + return -ENOENT; +} diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index fd81289fd3..fa6c2e95d5 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -608,19 +608,68 @@ static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst, continue; /* Exact match. */ - if (!bacmp(&iso_pi(sk)->src, src)) + if (!bacmp(&iso_pi(sk)->src, src)) { + sock_hold(sk); break; + } /* Closest match */ - if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) + if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) { + if (sk1) + sock_put(sk1); + sk1 = sk; + sock_hold(sk1); + } } + if (sk && sk1) + sock_put(sk1); + read_unlock(&iso_sk_list.lock); return sk ? sk : sk1; } +static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src, + bdaddr_t *dst, uint8_t big) +{ + struct sock *sk = NULL; + + read_lock(&iso_sk_list.lock); + + sk_for_each(sk, &iso_sk_list.head) { + if (match_sk == sk) + continue; + + /* Look for sockets that have already been + * connected to the BIG + */ + if (sk->sk_state != BT_CONNECTED && + sk->sk_state != BT_CONNECT) + continue; + + /* Match Broadcast destination */ + if (bacmp(&iso_pi(sk)->dst, dst)) + continue; + + /* Match BIG handle */ + if (iso_pi(sk)->qos.bcast.big != big) + continue; + + /* Match source address */ + if (bacmp(&iso_pi(sk)->src, src)) + continue; + + sock_hold(sk); + break; + } + + read_unlock(&iso_sk_list.lock); + + return sk; +} + static void iso_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); @@ -673,6 +722,28 @@ static void iso_sock_kill(struct sock *sk) static void iso_sock_disconn(struct sock *sk) { + struct sock *bis_sk; + struct hci_conn *hcon = iso_pi(sk)->conn->hcon; + + if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) { + bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src, + &iso_pi(sk)->dst, + iso_pi(sk)->qos.bcast.big); + + /* If there are any other connected sockets for the + * same BIG, just delete the sk and leave the bis + * hcon active, in case later rebinding is needed. + */ + if (bis_sk) { + hcon->state = BT_OPEN; + iso_pi(sk)->conn->hcon = NULL; + iso_sock_clear_timer(sk); + iso_chan_del(sk, bt_to_errno(hcon->abort_reason)); + sock_put(bis_sk); + return; + } + } + sk->sk_state = BT_DISCONN; iso_sock_set_timer(sk, ISO_DISCONN_TIMEOUT); iso_conn_lock(iso_pi(sk)->conn); @@ -766,10 +837,10 @@ static struct bt_iso_qos default_qos = { .bcode = {0x00}, .options = 0x00, .skip = 0x0000, - .sync_timeout = 0x4000, + .sync_timeout = BT_ISO_SYNC_TIMEOUT, .sync_cte_type = 0x00, .mse = 0x00, - .timeout = 0x4000, + .timeout = BT_ISO_SYNC_TIMEOUT, }, }; @@ -826,27 +897,75 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid, sa->iso_bc->bc_num_bis); - if (addr_len > sizeof(*sa) + sizeof(*sa->iso_bc)) + if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) return -EINVAL; bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr); + + /* Check if the address type is of LE type */ + if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) + return -EINVAL; + iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; iso_pi(sk)->sync_handle = -1; + + if (sa->iso_bc->bc_sid > 0x0f) + return -EINVAL; + iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; + + if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) + return -EINVAL; + iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; - for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) { + for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) return -EINVAL; - memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, - iso_pi(sk)->bc_num_bis); - } + memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, + iso_pi(sk)->bc_num_bis); return 0; } +static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa, + int addr_len) +{ + int err = 0; + + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EINVAL; + goto done; + } + + if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) { + err = -EINVAL; + goto done; + } + + if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { + err = -EINVAL; + goto done; + } + + iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; + + for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) + if (sa->iso_bc->bc_bis[i] < 0x01 || + sa->iso_bc->bc_bis[i] > 0x1f) { + err = -EINVAL; + goto done; + } + + memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, + iso_pi(sk)->bc_num_bis); + +done: + return err; +} + static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { @@ -862,6 +981,15 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, lock_sock(sk); + /* Allow the user to bind a PA sync socket to a number + * of BISes to sync to. + */ + if (sk->sk_state == BT_CONNECT2 && + test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { + err = iso_sock_bind_pa_sk(sk, sa, addr_len); + goto done; + } + if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; @@ -1302,8 +1430,8 @@ static bool check_ucast_qos(struct bt_iso_qos *qos) static bool check_bcast_qos(struct bt_iso_qos *qos) { - if (qos->bcast.sync_factor == 0x00) - return false; + if (!qos->bcast.sync_factor) + qos->bcast.sync_factor = 0x01; if (qos->bcast.packing > 0x01) return false; @@ -1326,6 +1454,9 @@ static bool check_bcast_qos(struct bt_iso_qos *qos) if (qos->bcast.skip > 0x01f3) return false; + if (!qos->bcast.sync_timeout) + qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; + if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; @@ -1335,6 +1466,9 @@ static bool check_bcast_qos(struct bt_iso_qos *qos) if (qos->bcast.mse > 0x1f) return false; + if (!qos->bcast.timeout) + qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; + if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; @@ -1345,7 +1479,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_iso_qos qos = default_qos; u32 opt; @@ -1360,10 +1494,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -1372,10 +1505,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -1390,17 +1522,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - len = min_t(unsigned int, sizeof(qos), optlen); - - if (copy_from_sockptr(&qos, optval, len)) { - err = -EFAULT; - break; - } - - if (len == sizeof(qos.ucast) && !check_ucast_qos(&qos)) { - err = -EINVAL; + err = bt_copy_from_sockptr(&qos, sizeof(qos), optval, optlen); + if (err) break; - } iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; @@ -1415,18 +1539,16 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, } if (optlen > sizeof(iso_pi(sk)->base)) { - err = -EOVERFLOW; + err = -EINVAL; break; } - len = min_t(unsigned int, sizeof(iso_pi(sk)->base), optlen); - - if (copy_from_sockptr(iso_pi(sk)->base, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(iso_pi(sk)->base, optlen, optval, + optlen); + if (err) break; - } - iso_pi(sk)->base_len = len; + iso_pi(sk)->base_len = optlen; break; @@ -1728,6 +1850,7 @@ static void iso_conn_ready(struct iso_conn *conn) parent->sk_data_ready(parent); release_sock(parent); + sock_put(parent); } } @@ -1823,6 +1946,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); + sock_put(sk); sk = NULL; } } @@ -1831,16 +1955,58 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) { - size_t base_len = ev3->length; + size_t base_len = 0; u8 *base; + struct hci_conn *hcon; sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sync_handle_pa_report, ev3); - base = eir_get_service_data(ev3->data, ev3->length, - EIR_BAA_SERVICE_UUID, &base_len); - if (base && sk && base_len <= sizeof(iso_pi(sk)->base)) { + if (!sk) + goto done; + + hcon = iso_pi(sk)->conn->hcon; + if (!hcon) + goto done; + + if (ev3->data_status == LE_PA_DATA_TRUNCATED) { + /* The controller was unable to retrieve PA data. */ + memset(hcon->le_per_adv_data, 0, + HCI_MAX_PER_AD_TOT_LEN); + hcon->le_per_adv_data_len = 0; + hcon->le_per_adv_data_offset = 0; + goto done; + } + + if (hcon->le_per_adv_data_offset + ev3->length > + HCI_MAX_PER_AD_TOT_LEN) + goto done; + + memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset, + ev3->data, ev3->length); + hcon->le_per_adv_data_offset += ev3->length; + + if (ev3->data_status == LE_PA_DATA_COMPLETE) { + /* All PA data has been received. */ + hcon->le_per_adv_data_len = + hcon->le_per_adv_data_offset; + hcon->le_per_adv_data_offset = 0; + + /* Extract BASE */ + base = eir_get_service_data(hcon->le_per_adv_data, + hcon->le_per_adv_data_len, + EIR_BAA_SERVICE_UUID, + &base_len); + + if (!base || base_len > BASE_MAX_LENGTH) + goto done; + memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; + } else { + /* This is a PA data fragment. Keep pa_data_len set to 0 + * until all data has been reassembled. + */ + hcon->le_per_adv_data_len = 0; } } else { sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL); @@ -1855,6 +2021,8 @@ done: if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; + sock_put(sk); + return lm; } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ab5a9d42fa..84fc70862d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4054,8 +4054,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn, return -EPROTO; hci_dev_lock(hdev); - if (hci_dev_test_flag(hdev, HCI_MGMT) && - !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) mgmt_device_connected(hdev, hcon, NULL, 0); hci_dev_unlock(hdev); @@ -6925,7 +6924,7 @@ static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data) } int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, - bdaddr_t *dst, u8 dst_type) + bdaddr_t *dst, u8 dst_type, u16 timeout) { struct l2cap_conn *conn; struct hci_conn *hcon; @@ -7018,19 +7017,17 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) hcon = hci_connect_le(hdev, dst, dst_type, false, - chan->sec_level, - HCI_LE_CONN_TIMEOUT, - HCI_ROLE_SLAVE); + chan->sec_level, timeout, + HCI_ROLE_SLAVE, 0, 0); else hcon = hci_connect_le_scan(hdev, dst, dst_type, - chan->sec_level, - HCI_LE_CONN_TIMEOUT, + chan->sec_level, timeout, CONN_REASON_L2CAP_CHAN); } else { u8 auth_type = l2cap_get_auth_type(chan); hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type, - CONN_REASON_L2CAP_CHAN); + CONN_REASON_L2CAP_CHAN, timeout); } if (IS_ERR(hcon)) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ee7a41d699..5cc83f906c 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -254,7 +254,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, chan->mode = L2CAP_MODE_LE_FLOWCTL; err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), - &la.l2_bdaddr, la.l2_bdaddr_type); + &la.l2_bdaddr, la.l2_bdaddr_type, + sk->sk_sndtimeo); if (err) return err; @@ -438,7 +439,8 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; struct l2cap_conninfo cinfo; - int len, err = 0; + int err = 0; + size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -485,7 +487,7 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, BT_DBG("mode 0x%2.2x", chan->mode); - len = min_t(unsigned int, len, sizeof(opts)); + len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *) &opts, len)) err = -EFAULT; @@ -535,7 +537,7 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, cinfo.hci_handle = chan->conn->hcon->handle; memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3); - len = min_t(unsigned int, len, sizeof(cinfo)); + len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *) &cinfo, len)) err = -EFAULT; @@ -726,7 +728,7 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct l2cap_options opts; - int len, err = 0; + int err = 0; u32 opt; BT_DBG("sk %p", sk); @@ -753,11 +755,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; - len = min_t(unsigned int, sizeof(opts), optlen); - if (copy_from_sockptr(&opts, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opts, sizeof(opts), optval, optlen); + if (err) break; - } if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) { err = -EINVAL; @@ -800,10 +800,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; case L2CAP_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt & L2CAP_LM_FIPS) { err = -EINVAL; @@ -884,7 +883,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; struct l2cap_conn *conn; - int len, err = 0; + int err = 0; u32 opt; u16 mtu; u8 mode; @@ -910,11 +909,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level < BT_SECURITY_LOW || sec.level > BT_SECURITY_FIPS) { @@ -959,10 +956,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) { set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -974,10 +970,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_FLUSHABLE: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt > BT_FLUSHABLE_ON) { err = -EINVAL; @@ -1009,11 +1004,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, pwr.force_active = BT_POWER_FORCE_ACTIVE_ON; - len = min_t(unsigned int, sizeof(pwr), optlen); - if (copy_from_sockptr(&pwr, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&pwr, sizeof(pwr), optval, optlen); + if (err) break; - } if (pwr.force_active) set_bit(FLAG_FORCE_ACTIVE, &chan->flags); @@ -1022,10 +1015,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_CHANNEL_POLICY: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } err = -EOPNOTSUPP; break; @@ -1054,10 +1046,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&mtu, optval, sizeof(u16))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&mtu, sizeof(mtu), optval, optlen); + if (err) break; - } if (chan->mode == L2CAP_MODE_EXT_FLOWCTL && sk->sk_state == BT_CONNECTED) @@ -1085,10 +1076,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&mode, optval, sizeof(u8))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&mode, sizeof(mode), optval, optlen); + if (err) break; - } BT_DBG("mode %u", mode); diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 53a796ac07..43aa01fd07 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -30,6 +30,15 @@ #include <net/bluetooth/bluetooth.h> +/** + * baswap() - Swaps the order of a bd address + * @dst: Pointer to a bdaddr_t struct that will store the swapped + * bd address. + * @src: Pointer to the bdaddr_t struct to be swapped. + * + * This function reverses the byte order of a Bluetooth device + * address. + */ void baswap(bdaddr_t *dst, const bdaddr_t *src) { const unsigned char *s = (const unsigned char *)src; @@ -41,7 +50,19 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src) } EXPORT_SYMBOL(baswap); -/* Bluetooth error codes to Unix errno mapping */ +/** + * bt_to_errno() - Bluetooth error codes to standard errno + * @code: Bluetooth error code to be converted + * + * This function takes a Bluetooth error code as input and convets + * it to an equivalent Unix/standard errno value. + * + * Return: + * + * If the bt error code is known, an equivalent Unix errno value + * is returned. + * If the given bt error code is not known, ENOSYS is returned. + */ int bt_to_errno(__u16 code) { switch (code) { @@ -135,10 +156,22 @@ int bt_to_errno(__u16 code) } EXPORT_SYMBOL(bt_to_errno); -/* Unix errno to Bluetooth error codes mapping */ +/** + * bt_status() - Standard errno value to Bluetooth error code + * @err: Unix/standard errno value to be converted + * + * This function converts a standard/Unix errno value to an + * equivalent Bluetooth error code. + * + * Return: Bluetooth error code. + * + * If the given errno is not found, 0x1f is returned by default + * which indicates an unspecified error. + * For err >= 0, no conversion is performed, and the same value + * is immediately returned. + */ __u8 bt_status(int err) { - /* Don't convert if already positive value */ if (err >= 0) return err; @@ -206,6 +239,10 @@ __u8 bt_status(int err) } EXPORT_SYMBOL(bt_status); +/** + * bt_info() - Log Bluetooth information message + * @format: Message's format string + */ void bt_info(const char *format, ...) { struct va_format vaf; @@ -222,6 +259,10 @@ void bt_info(const char *format, ...) } EXPORT_SYMBOL(bt_info); +/** + * bt_warn() - Log Bluetooth warning message + * @format: Message's format string + */ void bt_warn(const char *format, ...) { struct va_format vaf; @@ -238,6 +279,10 @@ void bt_warn(const char *format, ...) } EXPORT_SYMBOL(bt_warn); +/** + * bt_err() - Log Bluetooth error message + * @format: Message's format string + */ void bt_err(const char *format, ...) { struct va_format vaf; @@ -267,6 +312,10 @@ bool bt_dbg_get(void) return debug_enable; } +/** + * bt_dbg() - Log Bluetooth debugging message + * @format: Message's format string + */ void bt_dbg(const char *format, ...) { struct va_format vaf; @@ -287,6 +336,13 @@ void bt_dbg(const char *format, ...) EXPORT_SYMBOL(bt_dbg); #endif +/** + * bt_warn_ratelimited() - Log rate-limited Bluetooth warning message + * @format: Message's format string + * + * This functions works like bt_warn, but it uses rate limiting + * to prevent the message from being logged too often. + */ void bt_warn_ratelimited(const char *format, ...) { struct va_format vaf; @@ -303,6 +359,13 @@ void bt_warn_ratelimited(const char *format, ...) } EXPORT_SYMBOL(bt_warn_ratelimited); +/** + * bt_err_ratelimited() - Log rate-limited Bluetooth error message + * @format: Message's format string + * + * This functions works like bt_err, but it uses rate limiting + * to prevent the message from being logged too often. + */ void bt_err_ratelimited(const char *format, ...) { struct va_format vaf; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 92fd3786bb..b8e05ddeed 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2616,7 +2616,11 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto failed; } - err = hci_cmd_sync_queue(hdev, add_uuid_sync, cmd, mgmt_class_complete); + /* MGMT_OP_ADD_UUID don't require adapter the UP/Running so use + * hci_cmd_sync_submit instead of hci_cmd_sync_queue. + */ + err = hci_cmd_sync_submit(hdev, add_uuid_sync, cmd, + mgmt_class_complete); if (err < 0) { mgmt_pending_free(cmd); goto failed; @@ -2710,8 +2714,11 @@ update_class: goto unlock; } - err = hci_cmd_sync_queue(hdev, remove_uuid_sync, cmd, - mgmt_class_complete); + /* MGMT_OP_REMOVE_UUID don't require adapter the UP/Running so use + * hci_cmd_sync_submit instead of hci_cmd_sync_queue. + */ + err = hci_cmd_sync_submit(hdev, remove_uuid_sync, cmd, + mgmt_class_complete); if (err < 0) mgmt_pending_free(cmd); @@ -2777,8 +2784,11 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - err = hci_cmd_sync_queue(hdev, set_class_sync, cmd, - mgmt_class_complete); + /* MGMT_OP_SET_DEV_CLASS don't require adapter the UP/Running so use + * hci_cmd_sync_submit instead of hci_cmd_sync_queue. + */ + err = hci_cmd_sync_submit(hdev, set_class_sync, cmd, + mgmt_class_complete); if (err < 0) mgmt_pending_free(cmd); @@ -3436,7 +3446,8 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->addr.type == BDADDR_BREDR) { conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level, - auth_type, CONN_REASON_PAIR_DEVICE); + auth_type, CONN_REASON_PAIR_DEVICE, + HCI_ACL_CONN_TIMEOUT); } else { u8 addr_type = le_addr_type(cp->addr.type); struct hci_conn_params *p; @@ -5467,8 +5478,8 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, goto unlock; } - err = hci_cmd_sync_queue(hdev, mgmt_remove_adv_monitor_sync, cmd, - mgmt_remove_adv_monitor_complete); + err = hci_cmd_sync_submit(hdev, mgmt_remove_adv_monitor_sync, cmd, + mgmt_remove_adv_monitor_complete); if (err) { mgmt_pending_remove(cmd); @@ -10064,21 +10075,6 @@ static bool eir_has_uuids(u8 *eir, u16 eir_len, u16 uuid_count, u8 (*uuids)[16]) return false; } -static void restart_le_scan(struct hci_dev *hdev) -{ - /* If controller is not scanning we are done. */ - if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) - return; - - if (time_after(jiffies + DISCOV_LE_RESTART_DELAY, - hdev->discovery.scan_start + - hdev->discovery.scan_duration)) - return; - - queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_restart, - DISCOV_LE_RESTART_DELAY); -} - static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) { @@ -10113,8 +10109,6 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, * scanning to ensure updated result with updated RSSI values. */ if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) { - restart_le_scan(hdev); - /* Validate RSSI value against the RSSI threshold once more. */ if (hdev->discovery.rssi != HCI_RSSI_INVALID && rssi < hdev->discovery.rssi) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b54e8a530f..29aa07e9db 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -629,7 +629,7 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, switch (optname) { case RFCOMM_LM: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + if (bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen)) { err = -EFAULT; break; } @@ -664,7 +664,6 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct bt_security sec; int err = 0; - size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -686,11 +685,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, sec.level = BT_SECURITY_LOW; - len = min_t(unsigned int, sizeof(sec), optlen); - if (copy_from_sockptr(&sec, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&sec, sizeof(sec), optval, optlen); + if (err) break; - } if (sec.level > BT_SECURITY_HIGH) { err = -EINVAL; @@ -706,10 +703,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 94ec913dfb..69c75c041f 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1041,7 +1041,7 @@ static void rfcomm_tty_flush_buffer(struct tty_struct *tty) tty_wakeup(tty); } -static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch) +static void rfcomm_tty_send_xchar(struct tty_struct *tty, u8 ch) { BT_DBG("tty %p ch %c", tty, ch); } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index c736186aba..5d03c5440b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -264,7 +264,8 @@ static int sco_connect(struct sock *sk) } hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, - sco_pi(sk)->setting, &sco_pi(sk)->codec); + sco_pi(sk)->setting, &sco_pi(sk)->codec, + sk->sk_sndtimeo); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -823,7 +824,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; @@ -842,10 +843,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -862,11 +862,10 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; - len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_sockptr(&voice, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&voice, sizeof(voice), optval, + optlen); + if (err) break; - } /* Explicitly check for these values */ if (voice.setting != BT_VOICE_TRANSPARENT && @@ -889,10 +888,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -933,9 +931,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(buffer, optval, optlen)) { + err = bt_copy_from_sockptr(buffer, optlen, optval, optlen); + if (err) { hci_dev_put(hdev); - err = -EFAULT; break; } @@ -966,7 +964,8 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, struct sock *sk = sock->sk; struct sco_options opts; struct sco_conninfo cinfo; - int len, err = 0; + int err = 0; + size_t len; BT_DBG("sk %p", sk); @@ -988,7 +987,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, BT_DBG("mtu %u", opts.mtu); - len = min_t(unsigned int, len, sizeof(opts)); + len = min(len, sizeof(opts)); if (copy_to_user(optval, (char *)&opts, len)) err = -EFAULT; @@ -1006,7 +1005,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname, cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); - len = min_t(unsigned int, len, sizeof(cinfo)); + len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *)&cinfo, len)) err = -EFAULT; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 5918d1b32e..8906f7bdf4 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -12,6 +12,11 @@ extern struct bpf_struct_ops bpf_bpf_dummy_ops; /* A common type for test_N with return value in bpf_dummy_ops */ typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...); +static int dummy_ops_test_ret_function(struct bpf_dummy_ops_state *state, ...) +{ + return 0; +} + struct bpf_dummy_ops_test_args { u64 args[MAX_BPF_FUNC_ARGS]; struct bpf_dummy_ops_state state; @@ -62,7 +67,7 @@ static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args) static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) { - dummy_ops_test_ret_fn test = (void *)image; + dummy_ops_test_ret_fn test = (void *)image + cfi_get_offset(); struct bpf_dummy_ops_state *state = NULL; /* state needs to be NULL if args[0] is 0 */ @@ -101,12 +106,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, goto out; } - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) { err = -ENOMEM; goto out; } - set_vm_flush_reset_perms(image); link = kzalloc(sizeof(*link), GFP_USER); if (!link) { @@ -120,11 +124,12 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, op_idx = prog->expected_attach_type; err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[op_idx], + &dummy_ops_test_ret_function, image, image + PAGE_SIZE); if (err < 0) goto out; - set_memory_rox((long)image, 1); + arch_protect_bpf_trampoline(image, PAGE_SIZE); prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); @@ -134,7 +139,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, err = -EFAULT; out: kfree(args); - bpf_jit_free_exec(image); + arch_free_bpf_trampoline(image, PAGE_SIZE); if (link) bpf_link_put(&link->link); kfree(tlinks); @@ -220,6 +225,28 @@ static void bpf_dummy_unreg(void *kdata) { } +static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb) +{ + return 0; +} + +static int bpf_dummy_test_2(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, + char a3, unsigned long a4) +{ + return 0; +} + +static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb) +{ + return 0; +} + +static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { + .test_1 = bpf_dummy_test_1, + .test_2 = bpf_dummy_test_2, + .test_sleepable = bpf_dummy_test_sleepable, +}; + struct bpf_struct_ops bpf_bpf_dummy_ops = { .verifier_ops = &bpf_dummy_verifier_ops, .init = bpf_dummy_init, @@ -228,4 +255,5 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = { .reg = bpf_dummy_reg, .unreg = bpf_dummy_unreg, .name = "bpf_dummy_ops", + .cfi_stubs = &__bpf_bpf_dummy_ops, }; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 711cf5d598..dfd9193740 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -600,10 +600,21 @@ __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) refcount_dec(&p->cnt); } +__bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p) +{ + bpf_kfunc_call_test_release(p); +} +CFI_NOSEAL(bpf_kfunc_call_test_release_dtor); + __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } +__bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p) +{ +} +CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); + __bpf_kfunc_end_defs(); BTF_SET8_START(bpf_test_modify_return_ids) @@ -1671,9 +1682,9 @@ static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) BTF_ID(struct, prog_test_ref_kfunc) -BTF_ID(func, bpf_kfunc_call_test_release) +BTF_ID(func, bpf_kfunc_call_test_release_dtor) BTF_ID(struct, prog_test_member) -BTF_ID(func, bpf_kfunc_call_memb_release) +BTF_ID(func, bpf_kfunc_call_memb_release_dtor) static int __init bpf_prog_test_run_init(void) { diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore deleted file mode 100644 index f34e85ee82..0000000000 --- a/net/bpfilter/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -bpfilter_umh diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig deleted file mode 100644 index 3d4a214624..0000000000 --- a/net/bpfilter/Kconfig +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig BPFILTER - bool "BPF based packet filtering framework (BPFILTER)" - depends on BPF && INET - select USERMODE_DRIVER - help - This builds experimental bpfilter framework that is aiming to - provide netfilter compatible functionality via BPF - -if BPFILTER -config BPFILTER_UMH - tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC - default m - help - This builds bpfilter kernel module with embedded user mode helper - - Note: To compile this as built-in, your toolchain must support - building static binaries, since rootfs isn't mounted at the time - when __init functions are called and do_execv won't be able to find - the elf interpreter. -endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile deleted file mode 100644 index cdac82b8c5..0000000000 --- a/net/bpfilter/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the Linux BPFILTER layer. -# - -userprogs := bpfilter_umh -bpfilter_umh-objs := main.o -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi - -ifeq ($(CONFIG_BPFILTER_UMH), y) -# builtin bpfilter_umh should be linked with -static -# since rootfs isn't mounted at the time of __init -# function is called and do_execv won't find elf interpreter -userldflags += -static -endif - -$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh - -obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o -bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c deleted file mode 100644 index 97e129e3f3..0000000000 --- a/net/bpfilter/bpfilter_kern.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/init.h> -#include <linux/module.h> -#include <linux/umh.h> -#include <linux/bpfilter.h> -#include <linux/sched.h> -#include <linux/sched/signal.h> -#include <linux/fs.h> -#include <linux/file.h> -#include "msgfmt.h" - -extern char bpfilter_umh_start; -extern char bpfilter_umh_end; - -static void shutdown_umh(void) -{ - struct umd_info *info = &bpfilter_ops.info; - struct pid *tgid = info->tgid; - - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(info); - } -} - -static void __stop_umh(void) -{ - if (IS_ENABLED(CONFIG_INET)) - shutdown_umh(); -} - -static int bpfilter_send_req(struct mbox_request *req) -{ - struct mbox_reply reply; - loff_t pos = 0; - ssize_t n; - - if (!bpfilter_ops.info.tgid) - return -EFAULT; - pos = 0; - n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req), - &pos); - if (n != sizeof(*req)) { - pr_err("write fail %zd\n", n); - goto stop; - } - pos = 0; - n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), - &pos); - if (n != sizeof(reply)) { - pr_err("read fail %zd\n", n); - goto stop; - } - return reply.status; -stop: - __stop_umh(); - return -EFAULT; -} - -static int bpfilter_process_sockopt(struct sock *sk, int optname, - sockptr_t optval, unsigned int optlen, - bool is_set) -{ - struct mbox_request req = { - .is_set = is_set, - .pid = current->pid, - .cmd = optname, - .addr = (uintptr_t)optval.user, - .len = optlen, - }; - if (sockptr_is_kernel(optval)) { - pr_err("kernel access not supported\n"); - return -EFAULT; - } - return bpfilter_send_req(&req); -} - -static int start_umh(void) -{ - struct mbox_request req = { .pid = current->pid }; - int err; - - /* fork usermode process */ - err = fork_usermode_driver(&bpfilter_ops.info); - if (err) - return err; - pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); - - /* health check that usermode process started correctly */ - if (bpfilter_send_req(&req) != 0) { - shutdown_umh(); - return -EFAULT; - } - - return 0; -} - -static int __init load_umh(void) -{ - int err; - - err = umd_load_blob(&bpfilter_ops.info, - &bpfilter_umh_start, - &bpfilter_umh_end - &bpfilter_umh_start); - if (err) - return err; - - mutex_lock(&bpfilter_ops.lock); - err = start_umh(); - if (!err && IS_ENABLED(CONFIG_INET)) { - bpfilter_ops.sockopt = &bpfilter_process_sockopt; - bpfilter_ops.start = &start_umh; - } - mutex_unlock(&bpfilter_ops.lock); - if (err) - umd_unload_blob(&bpfilter_ops.info); - return err; -} - -static void __exit fini_umh(void) -{ - mutex_lock(&bpfilter_ops.lock); - if (IS_ENABLED(CONFIG_INET)) { - shutdown_umh(); - bpfilter_ops.start = NULL; - bpfilter_ops.sockopt = NULL; - } - mutex_unlock(&bpfilter_ops.lock); - - umd_unload_blob(&bpfilter_ops.info); -} -module_init(load_umh); -module_exit(fini_umh); -MODULE_LICENSE("GPL"); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S deleted file mode 100644 index 40311d10d2..0000000000 --- a/net/bpfilter/bpfilter_umh_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpfilter_umh_start -bpfilter_umh_start: - .incbin "net/bpfilter/bpfilter_umh" - .global bpfilter_umh_end -bpfilter_umh_end: diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c deleted file mode 100644 index 291a925462..0000000000 --- a/net/bpfilter/main.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include <sys/uio.h> -#include <errno.h> -#include <stdio.h> -#include <sys/socket.h> -#include <fcntl.h> -#include <unistd.h> -#include "../../include/uapi/linux/bpf.h" -#include <asm/unistd.h> -#include "msgfmt.h" - -FILE *debug_f; - -static int handle_get_cmd(struct mbox_request *cmd) -{ - switch (cmd->cmd) { - case 0: - return 0; - default: - break; - } - return -ENOPROTOOPT; -} - -static int handle_set_cmd(struct mbox_request *cmd) -{ - return -ENOPROTOOPT; -} - -static void loop(void) -{ - while (1) { - struct mbox_request req; - struct mbox_reply reply; - int n; - - n = read(0, &req, sizeof(req)); - if (n != sizeof(req)) { - fprintf(debug_f, "invalid request %d\n", n); - return; - } - - reply.status = req.is_set ? - handle_set_cmd(&req) : - handle_get_cmd(&req); - - n = write(1, &reply, sizeof(reply)); - if (n != sizeof(reply)) { - fprintf(debug_f, "reply failed %d\n", n); - return; - } - } -} - -int main(void) -{ - debug_f = fopen("/dev/kmsg", "w"); - setvbuf(debug_f, 0, _IOLBF, 0); - fprintf(debug_f, "<5>Started bpfilter\n"); - loop(); - fclose(debug_f); - return 0; -} diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h deleted file mode 100644 index 98d121c629..0000000000 --- a/net/bpfilter/msgfmt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_BPFILTER_MSGFMT_H -#define _NET_BPFILTER_MSGFMT_H - -struct mbox_request { - __u64 addr; - __u32 len; - __u32 is_set; - __u32 cmd; - __u32 pid; -}; - -struct mbox_reply { - __u32 status; -}; - -#endif diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8f40de3af1..65cee0ad3c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -471,6 +471,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_fdb_get = br_fdb_get, .ndo_mdb_add = br_mdb_add, .ndo_mdb_del = br_mdb_del, + .ndo_mdb_del_bulk = br_mdb_del_bulk, .ndo_mdb_dump = br_mdb_dump, .ndo_mdb_get = br_mdb_get, .ndo_bridge_getlink = br_getlink, diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f21097e734..ceaa5a89b9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -30,7 +30,7 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) return netif_receive_skb(skb); } -static int br_pass_frame_up(struct sk_buff *skb) +static int br_pass_frame_up(struct sk_buff *skb, bool promisc) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); @@ -65,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb) br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb), BR_MCAST_DIR_TX); + BR_INPUT_SKB_CB(skb)->promisc = promisc; + return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(indev), NULL, skb, indev, NULL, br_netif_receive_skb); @@ -82,6 +84,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_mcast *brmctx; struct net_bridge_vlan *vlan; struct net_bridge *br; + bool promisc; u16 vid = 0; u8 state; @@ -137,7 +140,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); - local_rcv = !!(br->dev->flags & IFF_PROMISC); + promisc = !!(br->dev->flags & IFF_PROMISC); + local_rcv = promisc; + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { @@ -200,7 +205,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb unsigned long now = jiffies; if (test_bit(BR_FDB_LOCAL, &dst->flags)) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, false); if (now != dst->used) dst->used = now; @@ -213,7 +218,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } if (local_rcv) - return br_pass_frame_up(skb); + return br_pass_frame_up(skb, promisc); out: return 0; @@ -386,6 +391,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } + BR_INPUT_SKB_CB(skb)->promisc = false; + /* The else clause should be hit when nf_hook(): * - returns < 0 (drop/error) * - returns = 0 (stolen/nf_queue) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 8cc526067b..bc37e47ad8 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1412,6 +1412,139 @@ int br_mdb_del(struct net_device *dev, struct nlattr *tb[], return err; } +struct br_mdb_flush_desc { + u32 port_ifindex; + u16 vid; + u8 rt_protocol; + u8 state; + u8 state_mask; +}; + +static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = { + [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), + [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT), +}; + +static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); + struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; + int err; + + desc->port_ifindex = entry->ifindex; + desc->vid = entry->vid; + desc->state = entry->state; + + if (!tb[MDBA_SET_ENTRY_ATTRS]) + return 0; + + err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, + tb[MDBA_SET_ENTRY_ATTRS], + br_mdbe_attrs_del_bulk_pol, extack); + if (err) + return err; + + if (mdbe_attrs[MDBE_ATTR_STATE_MASK]) + desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]); + + if (mdbe_attrs[MDBE_ATTR_RTPROT]) + desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); + + return 0; +} + +static void br_mdb_flush_host(struct net_bridge *br, + struct net_bridge_mdb_entry *mp, + const struct br_mdb_flush_desc *desc) +{ + u8 state; + + if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex) + return; + + if (desc->rt_protocol) + return; + + state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0; + if (desc->state_mask && (state & desc->state_mask) != desc->state) + return; + + br_multicast_host_leave(mp, true); + if (!mp->ports && netif_running(br->dev)) + mod_timer(&mp->timer, jiffies); +} + +static void br_mdb_flush_pgs(struct net_bridge *br, + struct net_bridge_mdb_entry *mp, + const struct br_mdb_flush_desc *desc) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) { + u8 state; + + if (desc->port_ifindex && + desc->port_ifindex != p->key.port->dev->ifindex) { + pp = &p->next; + continue; + } + + if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) { + pp = &p->next; + continue; + } + + state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0; + if (desc->state_mask && + (state & desc->state_mask) != desc->state) { + pp = &p->next; + continue; + } + + br_multicast_del_pg(mp, p, pp); + } +} + +static void br_mdb_flush(struct net_bridge *br, + const struct br_mdb_flush_desc *desc) +{ + struct net_bridge_mdb_entry *mp; + + spin_lock_bh(&br->multicast_lock); + + /* Safe variant is not needed because entries are removed from the list + * upon group timer expiration or bridge deletion. + */ + hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { + if (desc->vid && desc->vid != mp->addr.vid) + continue; + + br_mdb_flush_host(br, mp, desc); + br_mdb_flush_pgs(br, mp, desc); + } + + spin_unlock_bh(&br->multicast_lock); +} + +int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + struct net_bridge *br = netdev_priv(dev); + struct br_mdb_flush_desc desc = {}; + int err; + + err = br_mdb_flush_desc_init(&desc, tb, extack); + if (err) + return err; + + br_mdb_flush(br, &desc); + + return 0; +} + static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 35e10c5a76..22e35623c1 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -600,11 +600,17 @@ static unsigned int br_nf_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; struct nf_conntrack *nfct = skb_nfct(skb); const struct nf_ct_hook *ct_hook; struct nf_conn *ct; int ret; + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5ad4abfcb7..d415833fce 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -666,7 +666,7 @@ void br_ifinfo_notify(int event, const struct net_bridge *br, { u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED; - return br_info_notify(event, br, port, filter); + br_info_notify(event, br, port, filter); } /* diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f317d8295b..d4bedc87b1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -186,6 +186,7 @@ enum { * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member + * @tnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags * @priv_flags: private (in-kernel) bridge vlan flags @@ -196,6 +197,7 @@ enum { * @refcnt: if MASTER flag set, this is bumped for each port referencing it * @brvlan: if MASTER flag unset, this points to the global per-VLAN context * for this VLAN entry + * @tinfo: bridge tunnel info * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast * context @@ -587,6 +589,7 @@ struct br_input_skb_cb { #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; + u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif @@ -1020,6 +1023,8 @@ int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); +int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack); int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, @@ -1428,6 +1433,12 @@ static inline int br_mdb_del(struct net_device *dev, struct nlattr *tb[], return -EOPNOTSUPP; } +static inline int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 99d82676f7..cbd0e3586c 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1111,6 +1111,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) struct ebt_table_info *newinfo; struct ebt_replace tmp; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1423,6 +1425,8 @@ static int update_counters(struct net *net, sockptr_t arg, unsigned int len) { struct ebt_replace hlp; + if (len < sizeof(hlp)) + return -EINVAL; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; @@ -2352,6 +2356,8 @@ static int compat_update_counters(struct net *net, sockptr_t arg, { struct compat_ebt_replace hlp; + if (len < sizeof(hlp)) + return -EINVAL; if (copy_from_sockptr(&hlp, arg, sizeof(hlp))) return -EFAULT; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6f877e3170..c3c51b9a68 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -294,18 +294,24 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - enum ip_conntrack_info ctinfo; + bool promisc = BR_INPUT_SKB_CB(skb)->promisc; + struct nf_conntrack *nfct = skb_nfct(skb); struct nf_conn *ct; - if (skb->pkt_type == PACKET_HOST) + if (promisc) { + nf_reset_ct(skb); + return NF_ACCEPT; + } + + if (!nfct || skb->pkt_type == PACKET_HOST) return NF_ACCEPT; /* nf_conntrack_confirm() cannot handle concurrent clones, * this happens for broad/multicast frames with e.g. macvlan on top * of the bridge device. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) + ct = container_of(nfct, struct nf_conn, ct_general); + if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) return NF_ACCEPT; /* let inet prerouting call conntrack again */ diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 6a0cba4fc2..24e85c5487 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -27,6 +27,7 @@ #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> +MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support"); MODULE_LICENSE("GPL"); /* Used for local tracking of the CAIF net devices */ diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 9c82698da4..039dfbd367 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -27,6 +27,7 @@ #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> +MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol socket support (AF_CAIF)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_CAIF); diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index bf61ea4b81..5dc05a1e31 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -20,6 +20,7 @@ #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> +MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol USB support"); MODULE_LICENSE("GPL"); #define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */ diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index f35fc87c45..47901bd4de 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -31,6 +31,7 @@ /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); +MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol GPRS network device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("caif"); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index a0ca5414b3..bd608ffa06 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -2034,6 +2034,9 @@ static int prepare_sparse_read_data(struct ceph_connection *con) if (!con_secure(con)) con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, msg, + msg->sparse_read_total); + reset_in_kvecs(con); con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT; con->v2.data_len_remain = data_len(msg); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3babcd5e65..9d078b37fe 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5847,8 +5847,6 @@ static inline void convert_extent_map(struct ceph_sparse_read *sr) } #endif -#define MAX_EXTENTS 4096 - static int osd_sparse_read(struct ceph_connection *con, struct ceph_msg_data_cursor *cursor, char **pbuf) @@ -5879,23 +5877,16 @@ next_op: if (count > 0) { if (!sr->sr_extent || count > sr->sr_ext_len) { - /* - * Apply a hard cap to the number of extents. - * If we have more, assume something is wrong. - */ - if (count > MAX_EXTENTS) { - dout("%s: OSD returned 0x%x extents in a single reply!\n", - __func__, count); - return -EREMOTEIO; - } - /* no extent array provided, or too short */ kfree(sr->sr_extent); sr->sr_extent = kmalloc_array(count, sizeof(*sr->sr_extent), GFP_NOIO); - if (!sr->sr_extent) + if (!sr->sr_extent) { + pr_err("%s: failed to allocate %u extents\n", + __func__, count); return -ENOMEM; + } sr->sr_ext_len = count; } ret = count * sizeof(*sr->sr_extent); diff --git a/net/compat.c b/net/compat.c index 6564720f32..485db8ee9b 100644 --- a/net/compat.c +++ b/net/compat.c @@ -297,7 +297,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) int err = 0, i; for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } diff --git a/net/core/Makefile b/net/core/Makefile index 0cb734cbc2..821aec06ab 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -18,7 +18,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o -obj-$(CONFIG_PAGE_POOL) += page_pool.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cca7594be9..6c4d90b24d 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -275,9 +275,10 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; + int optmem_max; + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { diff --git a/net/core/dev.c b/net/core/dev.c index e3c06ccf21..c365aa06f8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -165,7 +165,6 @@ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -337,7 +336,7 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ - list_add_tail(&name_node->list, &dev->name_node->list); + list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } @@ -3757,6 +3756,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); + if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { @@ -3786,10 +3787,14 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, - SKB_DROP_REASON_QDISC_DROP); + tcf_get_drop_reason(to_free)); return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3829,7 +3834,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -3841,7 +3848,8 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); + kfree_skb_list_reason(to_free, + tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; @@ -3927,14 +3935,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - res.drop_reason = *drop_reason; + tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: - *drop_reason = res.drop_reason; + *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: @@ -6142,7 +6150,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -static struct napi_struct *napi_by_id(unsigned int napi_id) +struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -6403,6 +6411,43 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); +/** + * netif_queue_set_napi - Associate queue with the napi + * @dev: device to which NAPI and queue belong + * @queue_index: Index of queue + * @type: queue type as RX or TX + * @napi: NAPI context, pass NULL to clear previously set NAPI + * + * Set queue with its corresponding napi context. This should be done after + * registering the NAPI handler for the queue-vector and the queues have been + * mapped to the corresponding interrupt vector. + */ +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, struct napi_struct *napi) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + + if (WARN_ON_ONCE(napi && !napi->dev)) + return; + if (dev->reg_state >= NETREG_REGISTERED) + ASSERT_RTNL(); + + switch (type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(dev, queue_index); + rxq->napi = napi; + return; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(dev, queue_index); + txq->napi = napi; + return; + default: + return; + } +} +EXPORT_SYMBOL(netif_queue_set_napi); + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6438,6 +6483,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; + netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); @@ -9037,28 +9083,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin) -{ -#if IS_ENABLED(CONFIG_DPLL) - rtnl_lock(); - dev->dpll_pin = dpll_pin; - rtnl_unlock(); -#endif -} - -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) -{ - WARN_ON(!dpll_pin); - netdev_dpll_pin_assign(dev, dpll_pin); -} -EXPORT_SYMBOL(netdev_dpll_pin_set); - -void netdev_dpll_pin_clear(struct net_device *dev) -{ - netdev_dpll_pin_assign(dev, NULL); -} -EXPORT_SYMBOL(netdev_dpll_pin_clear); - /** * dev_change_proto_down - set carrier according to proto_down. * @@ -10517,7 +10541,7 @@ void netdev_run_todo(void) write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; write_unlock(&dev_base_lock); - linkwatch_forget_dev(dev); + linkwatch_sync_dev(dev); } while (!list_empty(&list)) { @@ -11242,17 +11266,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - /* Send a netdev-add uevent to the new namespace */ - kobject_uevent(&dev->dev.kobj, KOBJ_ADD); - netdev_adjacent_add_links(dev); - if (new_name[0]) /* Rename the netdev to prepared name */ strscpy(dev->name, new_name, IFNAMSIZ); /* Fixup kobjects */ + dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); + dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + netdev_adjacent_add_links(dev); + /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ @@ -11585,6 +11611,63 @@ static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; +static void __init net_dev_struct_check(void) +{ + /* TX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); +#ifdef CONFIG_XPS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); + + /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); + + /* RX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); +#ifdef CONFIG_NETPOLL + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); +#endif +#ifdef CONFIG_NET_XGRESS + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); +#endif + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); +} + /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -11602,6 +11685,8 @@ static int __init net_dev_init(void) BUG_ON(!dev_boot_phase); + net_dev_struct_check(); + if (dev_proc_init()) goto out; diff --git a/net/core/dev.h b/net/core/dev.h index 3f5eb92396..7480b4c842 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,7 +30,6 @@ int __init dev_proc_init(void); #endif void linkwatch_init_dev(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); void linkwatch_run_queue(void); void dev_addr_flush(struct net_device *dev); @@ -148,4 +147,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); #else static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif + +struct napi_struct *napi_by_id(unsigned int napi_id); #endif diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index feeddf95f4..9a66cf5015 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -322,9 +322,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in * dev->priv_flags. */ -static int dev_set_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg, - struct netlink_ext_ack *extack) +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; bool phy_ts = phy_has_hwtstamp(dev->phydev); @@ -363,6 +363,7 @@ static int dev_set_hwtstamp_phylib(struct net_device *dev, return 0; } +EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib); static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b240d9aae4..b0f221d658 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -183,7 +183,7 @@ out: } static const struct genl_multicast_group dropmon_mcgrps[] = { - { .name = "events", .cap_sys_admin = 1 }, + { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, }, }; static void send_dm_alert(struct work_struct *work) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 75282222e0..3f933ffcef 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -53,7 +53,7 @@ bool fib_rule_matchall(const struct fib_rule *rule) EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, - u32 pref, u32 table, u32 flags) + u32 pref, u32 table) { struct fib_rule *r; @@ -65,7 +65,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; - r->flags = flags; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -594,7 +593,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); - err = -EINVAL; if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; diff --git a/net/core/filter.c b/net/core/filter.c index cee5383831..ef3e78b6a3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -204,7 +204,7 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; nla = (struct nlattr *) &skb->data[a]; - if (nla->nla_len > skb->len - a) + if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); @@ -1220,8 +1220,8 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); - int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && @@ -1551,12 +1551,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); - int err; + int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1595,7 +1596,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; - int err; + int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; @@ -1623,7 +1624,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { + optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); + if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } @@ -7289,7 +7291,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES - u32 cookie; int ret; if (unlikely(!sk || th_len < sizeof(*th))) @@ -7311,8 +7312,6 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; - cookie = ntohl(th->ack_seq) - 1; - /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ @@ -7321,7 +7320,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; - ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); + ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) @@ -7332,7 +7331,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_family != AF_INET6) return -EINVAL; - ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); + ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ @@ -7785,9 +7784,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v4_check(iph, th, cookie) > 0) + if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; @@ -7808,9 +7805,7 @@ BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) - u32 cookie = ntohl(th->ack_seq) - 1; - - if (__cookie_v6_check(iph, th, cookie) > 0) + if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; diff --git a/net/core/gro.c b/net/core/gro.c index 0759277dc1..cefddf65f7 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -195,8 +195,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c469d1c4db..429571c258 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -192,7 +192,10 @@ static void __linkwatch_run_queue(int urgent_only) #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; - struct net_device *dev; + /* Use a local list here since we add non-urgent + * events back to the global one when called with + * urgent_only=1. + */ LIST_HEAD(wrk); /* Give urgent case more budget */ @@ -218,6 +221,7 @@ static void __linkwatch_run_queue(int urgent_only) list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { + struct net_device *dev; dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); @@ -245,7 +249,7 @@ static void __linkwatch_run_queue(int urgent_only) spin_unlock_irq(&lweventlist_lock); } -void linkwatch_forget_dev(struct net_device *dev) +void linkwatch_sync_dev(struct net_device *dev) { unsigned long flags; int clean = 0; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fccaa5bac0..a09d507c5b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -193,11 +193,22 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; - if (netif_running(netdev)) - return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + if (!rtnl_trylock()) + return restart_syscall(); - return -EINVAL; + if (netif_running(netdev)) { + /* Synchronize carrier state with link watch, + * see also rtnl_getlink(). + */ + linkwatch_sync_dev(netdev); + + ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + rtnl_unlock(); + + return ret; } static DEVICE_ATTR_RW(carrier); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f4183c4c1e..7279953342 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -372,6 +372,10 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; return 0; @@ -1099,11 +1103,56 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } +#ifdef CONFIG_NET_NS +static void __init netns_ipv4_struct_check(void) +{ + /* TX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_early_retrans); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_win_divisor); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_tso_rtt_log); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_autocorking); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_snd_mss); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_limit_output_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_min_rtt_wlen); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_tcp_wmem); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, + sysctl_ip_fwd_use_pmtu); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); + + /* TXRX readonly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); + + /* RX readonly hotpath cache line */ + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_ip_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rmem); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); +} +#endif + void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS + netns_ipv4_struct_check(); net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index ea9231378a..be7f2ebd61 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -10,11 +10,64 @@ #include <uapi/linux/netdev.h> +/* Integer value ranges */ +static const struct netlink_range_validation netdev_a_page_pool_id_range = { + .min = 1ULL, + .max = 4294967295ULL, +}; + +static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { + .min = 1ULL, + .max = 2147483647ULL, +}; + +/* Common nested types */ +const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), + [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* NETDEV_CMD_PAGE_POOL_GET - do */ +#ifdef CONFIG_PAGE_POOL +static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = { + [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), +}; +#endif /* CONFIG_PAGE_POOL */ + +/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */ +#ifdef CONFIG_PAGE_POOL_STATS +static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = { + [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy), +}; +#endif /* CONFIG_PAGE_POOL_STATS */ + +/* NETDEV_CMD_QUEUE_GET - do */ +static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_QUEUE_GET - dump */ +static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = { + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* NETDEV_CMD_NAPI_GET - do */ +static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, +}; + +/* NETDEV_CMD_NAPI_GET - dump */ +static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = { + [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -29,10 +82,67 @@ static const struct genl_split_ops netdev_nl_ops[] = { .dumpit = netdev_nl_dev_get_dumpit, .flags = GENL_CMD_CAP_DUMP, }, +#ifdef CONFIG_PAGE_POOL + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .doit = netdev_nl_page_pool_get_doit, + .policy = netdev_page_pool_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_GET, + .dumpit = netdev_nl_page_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL */ +#ifdef CONFIG_PAGE_POOL_STATS + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .doit = netdev_nl_page_pool_stats_get_doit, + .policy = netdev_page_pool_stats_get_nl_policy, + .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET, + .dumpit = netdev_nl_page_pool_stats_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +#endif /* CONFIG_PAGE_POOL_STATS */ + { + .cmd = NETDEV_CMD_QUEUE_GET, + .doit = netdev_nl_queue_get_doit, + .policy = netdev_queue_get_do_nl_policy, + .maxattr = NETDEV_A_QUEUE_TYPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_QUEUE_GET, + .dumpit = netdev_nl_queue_get_dumpit, + .policy = netdev_queue_get_dump_nl_policy, + .maxattr = NETDEV_A_QUEUE_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .doit = netdev_nl_napi_get_doit, + .policy = netdev_napi_get_do_nl_policy, + .maxattr = NETDEV_A_NAPI_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NETDEV_CMD_NAPI_GET, + .dumpit = netdev_nl_napi_get_dumpit, + .policy = netdev_napi_get_dump_nl_policy, + .maxattr = NETDEV_A_NAPI_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { [NETDEV_NLGRP_MGMT] = { "mgmt", }, + [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, }; struct genl_family netdev_nl_family __ro_after_init = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 7b370c073e..a47f2bcbe4 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -11,11 +11,27 @@ #include <uapi/linux/netdev.h> +/* Common nested types */ +extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; + int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info); +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); enum { NETDEV_NLGRP_MGMT, + NETDEV_NLGRP_PAGE_POOL, }; extern struct genl_family netdev_nl_family; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index fe61f85bcf..918b109e0c 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -6,13 +6,32 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/xdp.h> +#include <net/xdp_sock.h> +#include <net/netdev_rx_queue.h> +#include <net/busy_poll.h> #include "netdev-genl-gen.h" +#include "dev.h" + +struct netdev_nl_dump_ctx { + unsigned long ifindex; + unsigned int rxq_idx; + unsigned int txq_idx; + unsigned int napi_id; +}; + +static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) +{ + NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + + return (struct netdev_nl_dump_ctx *)cb->ctx; +} static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; @@ -26,11 +45,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC + if (netdev->xsk_tx_metadata_ops) { + if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) + xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; + if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) + xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + } + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, - xdp_rx_meta, NETDEV_A_DEV_PAD)) { + xdp_rx_meta, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, + xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } @@ -111,22 +139,325 @@ err_free_msg: int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); - for_each_netdev_dump(net, netdev, cb->args[0]) { + for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } rtnl_unlock(); - if (err != -EMSGSIZE) + return err; +} + +static int +netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, + const struct genl_info *info) +{ + void *hdr; + pid_t pid; + + if (WARN_ON_ONCE(!napi->dev)) + return -EINVAL; + if (!(napi->dev->flags & IFF_UP)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (napi->napi_id >= MIN_NAPI_ID && + nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + goto nla_put_failure; + + if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) + goto nla_put_failure; + + if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) + goto nla_put_failure; + + if (napi->thread) { + pid = task_pid_nr(napi->thread); + if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) + goto nla_put_failure; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + struct sk_buff *rsp; + u32 napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + napi = napi_by_id(napi_id); + if (napi) + err = netdev_nl_napi_fill_one(rsp, napi, info); + else + err = -EINVAL; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + struct napi_struct *napi; + int err = 0; + + if (!(netdev->flags & IFF_UP)) + return err; + + list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) + continue; + + err = netdev_nl_napi_fill_one(rsp, napi, info); + if (err) + return err; + ctx->napi_id = napi->napi_id; + } + return err; +} + +int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_NAPI_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev) + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + else + err = -ENODEV; + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->napi_id = 0; + } + } + rtnl_unlock(); + + return err; +} + +static int +netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, + u32 q_idx, u32 q_type, const struct genl_info *info) +{ + struct netdev_rx_queue *rxq; + struct netdev_queue *txq; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || + nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || + nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) + goto nla_put_failure; + + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, + rxq->napi->napi_id)) + goto nla_put_failure; + break; + case NETDEV_QUEUE_TYPE_TX: + txq = netdev_get_tx_queue(netdev, q_idx); + if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, + txq->napi->napi_id)) + goto nla_put_failure; + } + + genlmsg_end(rsp, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, + u32 q_type) +{ + switch (q_type) { + case NETDEV_QUEUE_TYPE_RX: + if (q_id >= netdev->real_num_rx_queues) + return -EINVAL; + return 0; + case NETDEV_QUEUE_TYPE_TX: + if (q_id >= netdev->real_num_tx_queues) + return -EINVAL; + } + return 0; +} + +static int +netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, + u32 q_type, const struct genl_info *info) +{ + int err = 0; + + if (!(netdev->flags & IFF_UP)) + return err; + + err = netdev_nl_queue_validate(netdev, q_idx, q_type); + if (err) + return err; + + return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); +} + +int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 q_id, q_type, ifindex; + struct net_device *netdev; + struct sk_buff *rsp; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) + return -EINVAL; + + q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); + q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (netdev) + err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); + else + err = -ENODEV; + + rtnl_unlock(); + + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, + const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + int err = 0; + int i; + + if (!(netdev->flags & IFF_UP)) return err; - return skb->len; + for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { + err = netdev_nl_queue_fill_one(rsp, netdev, i, + NETDEV_QUEUE_TYPE_RX, info); + if (err) + return err; + ctx->rxq_idx = i++; + } + for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { + err = netdev_nl_queue_fill_one(rsp, netdev, i, + NETDEV_QUEUE_TYPE_TX, info); + if (err) + return err; + ctx->txq_idx = i++; + } + + return err; +} + +int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + u32 ifindex = 0; + int err = 0; + + if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev) + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + else + err = -ENODEV; + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); + if (err < 0) + break; + ctx->rxq_idx = 0; + ctx->txq_idx = 0; + } + } + rtnl_unlock(); + + return err; } static int netdev_genl_netdevice_event(struct notifier_block *nb, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dec5443372..4933762e5a 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -23,10 +23,12 @@ #include <trace/events/page_pool.h> +#include "page_pool_priv.h" + #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) -#define BIAS_MAX LONG_MAX +#define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS /* alloc_stat_inc is intended to be used in softirq context */ @@ -69,7 +71,7 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). */ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; @@ -173,7 +175,8 @@ static int page_pool_init(struct page_pool *pool, { unsigned int ring_qsize = 1024; /* Default */ - memcpy(&pool->p, params, sizeof(pool->p)); + memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); + memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); /* Validate only known flags were used */ if (pool->p.flags & ~(PP_FLAG_ALL)) @@ -211,6 +214,8 @@ static int page_pool_init(struct page_pool *pool, */ } + pool->has_init_callback = !!pool->slow.init_callback; + #ifdef CONFIG_PAGE_POOL_STATS pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) @@ -235,6 +240,18 @@ static int page_pool_init(struct page_pool *pool, return 0; } +static void page_pool_uninit(struct page_pool *pool) +{ + ptr_ring_cleanup(&pool->ring, NULL); + + if (pool->p.flags & PP_FLAG_DMA_MAP) + put_device(pool->p.dev); + +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif +} + /** * page_pool_create() - create a page pool. * @params: parameters, see struct page_pool_params @@ -249,13 +266,21 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) return ERR_PTR(-ENOMEM); err = page_pool_init(pool, params); - if (err < 0) { - pr_warn("%s() gave up with errno %d\n", __func__, err); - kfree(pool); - return ERR_PTR(err); - } + if (err < 0) + goto err_free; + + err = page_pool_list(pool); + if (err) + goto err_uninit; return pool; + +err_uninit: + page_pool_uninit(pool); +err_free: + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); } EXPORT_SYMBOL(page_pool_create); @@ -388,8 +413,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, * the overhead is negligible. */ page_pool_fragment_page(page, 1); - if (pool->p.init_callback) - pool->p.init_callback(page, pool->p.init_arg); + if (pool->has_init_callback) + pool->slow.init_callback(page, pool->slow.init_arg); } static void page_pool_clear_pp_info(struct page *page) @@ -504,7 +529,7 @@ EXPORT_SYMBOL(page_pool_alloc_pages); */ #define _distance(a, b) (s32)((a) - (b)) -static s32 page_pool_inflight(struct page_pool *pool) +s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); @@ -512,27 +537,27 @@ static s32 page_pool_inflight(struct page_pool *pool) inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); - WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); + if (strict) { + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); + WARN(inflight < 0, "Negative(%d) inflight packet-pages", + inflight); + } else { + inflight = max(0, inflight); + } return inflight; } -/* Disconnects a page (from a page_pool). API users can have a need - * to disconnect a page (from a page_pool), to allow it to be used as - * a regular page (that will eventually be returned to the normal - * page-allocator via put_page). - */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) +static __always_inline +void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) { dma_addr_t dma; - int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) /* Always account for inflight pages, even if we didn't * map them */ - goto skip_dma_unmap; + return; dma = page_pool_get_dma_addr(page); @@ -541,7 +566,19 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page) PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr(page, 0); -skip_dma_unmap: +} + +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_return_page(struct page_pool *pool, struct page *page) +{ + int count; + + __page_pool_release_page_dma(pool, page); + page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so @@ -647,8 +684,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { @@ -657,7 +694,7 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, page_pool_return_page(pool, page); } } -EXPORT_SYMBOL(page_pool_put_defragged_page); +EXPORT_SYMBOL(page_pool_put_unrefed_page); /** * page_pool_put_page_bulk() - release references on multiple pages @@ -684,7 +721,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, struct page *page = virt_to_head_page(data[i]); /* It is not the last user for the page frag case */ - if (!page_pool_is_last_frag(page)) + if (!page_pool_is_last_ref(page)) continue; page = __page_pool_put_page(pool, page, -1, false); @@ -726,7 +763,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_defrag_page(page, drain_count))) + if (likely(page_pool_unref_page(page, drain_count))) return NULL; if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { @@ -747,7 +784,7 @@ static void page_pool_free_frag(struct page_pool *pool) pool->frag_page = NULL; - if (!page || page_pool_defrag_page(page, drain_count)) + if (!page || page_pool_unref_page(page, drain_count)) return; page_pool_return_page(pool, page); @@ -818,14 +855,8 @@ static void __page_pool_destroy(struct page_pool *pool) if (pool->disconnect) pool->disconnect(pool); - ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->p.flags & PP_FLAG_DMA_MAP) - put_device(pool->p.dev); - -#ifdef CONFIG_PAGE_POOL_STATS - free_percpu(pool->recycle_stats); -#endif + page_pool_unlist(pool); + page_pool_uninit(pool); kfree(pool); } @@ -862,7 +893,7 @@ static int page_pool_release(struct page_pool *pool) int inflight; page_pool_scrub(pool); - inflight = page_pool_inflight(pool); + inflight = page_pool_inflight(pool, true); if (!inflight) __page_pool_destroy(pool); @@ -873,18 +904,21 @@ static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + void *netdev; int inflight; inflight = page_pool_release(pool); if (!inflight) return; - /* Periodic warning */ - if (time_after_eq(jiffies, pool->defer_warn)) { + /* Periodic warning for page pools the user can't see */ + netdev = READ_ONCE(pool->slow.netdev); + if (time_after_eq(jiffies, pool->defer_warn) && + (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; - pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", - __func__, inflight, sec); + pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", + __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } @@ -929,6 +963,7 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_release(pool)) return; + page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h new file mode 100644 index 0000000000..90665d40f1 --- /dev/null +++ b/net/core/page_pool_priv.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PAGE_POOL_PRIV_H +#define __PAGE_POOL_PRIV_H + +s32 page_pool_inflight(const struct page_pool *pool, bool strict); + +int page_pool_list(struct page_pool *pool); +void page_pool_detached(struct page_pool *pool); +void page_pool_unlist(struct page_pool *pool); + +#endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c new file mode 100644 index 0000000000..3a3277ba16 --- /dev/null +++ b/net/core/page_pool_user.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <net/net_debug.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> +#include <net/sock.h> + +#include "page_pool_priv.h" +#include "netdev-genl-gen.h" + +static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); +/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. + * Ordering: inside rtnl_lock + */ +static DEFINE_MUTEX(page_pools_lock); + +/* Page pools are only reachable from user space (via netlink) if they are + * linked to a netdev at creation time. Following page pool "visibility" + * states are possible: + * - normal + * - user.list: linked to real netdev, netdev: real netdev + * - orphaned - real netdev has disappeared + * - user.list: linked to lo, netdev: lo + * - invisible - either (a) created without netdev linking, (b) unlisted due + * to error, or (c) the entire namespace which owned this pool disappeared + * - user.list: unhashed, netdev: unknown + */ + +typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info); + +static int +netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) +{ + struct page_pool *pool; + struct sk_buff *rsp; + int err; + + mutex_lock(&page_pools_lock); + pool = xa_load(&page_pools, id); + if (!pool || hlist_unhashed(&pool->user.list) || + !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { + err = -ENOENT; + goto err_unlock; + } + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) { + err = -ENOMEM; + goto err_unlock; + } + + err = fill(rsp, pool, info); + if (err) + goto err_free_msg; + + mutex_unlock(&page_pools_lock); + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +struct page_pool_dump_cb { + unsigned long ifindex; + u32 pp_id; +}; + +static int +netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, + pp_nl_fill_cb fill) +{ + struct page_pool_dump_cb *state = (void *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net *net = sock_net(skb->sk); + struct net_device *netdev; + struct page_pool *pool; + int err = 0; + + rtnl_lock(); + mutex_lock(&page_pools_lock); + for_each_netdev_dump(net, netdev, state->ifindex) { + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + if (state->pp_id && state->pp_id < pool->user.id) + continue; + + state->pp_id = pool->user.id; + err = fill(skb, pool, info); + if (err) + goto out; + } + + state->pp_id = 0; + } +out: + mutex_unlock(&page_pools_lock); + rtnl_unlock(); + + return err; +} + +static int +page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ +#ifdef CONFIG_PAGE_POOL_STATS + struct page_pool_stats stats = {}; + struct nlattr *nest; + void *hdr; + + if (!page_pool_get_stats(pool, &stats)) + return 0; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || + (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex))) + goto err_cancel_nest; + + nla_nest_end(rsp, nest); + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, + stats.alloc_stats.fast) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + stats.alloc_stats.slow) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + stats.alloc_stats.slow_high_order) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + stats.alloc_stats.empty) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + stats.alloc_stats.refill) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + stats.alloc_stats.waive) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + stats.recycle_stats.cached) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + stats.recycle_stats.cache_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + stats.recycle_stats.ring) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + stats.recycle_stats.ring_full) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + stats.recycle_stats.released_refcnt)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel_nest: + nla_nest_cancel(rsp, nest); +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +#else + GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); + return -EOPNOTSUPP; +#endif +} + +int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; + struct nlattr *nest; + int err; + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) + return -EINVAL; + + nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + netdev_page_pool_info_nl_policy, + info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NETDEV_A_PAGE_POOL_IFINDEX], + "selecting by ifindex not supported"); + return -EINVAL; + } + + id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); +} + +int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); +} + +static int +page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, + const struct genl_info *info) +{ + size_t inflight, refsz; + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) + goto err_cancel; + + if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && + nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, + pool->slow.netdev->ifindex)) + goto err_cancel; + if (pool->user.napi_id && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + goto err_cancel; + + inflight = page_pool_inflight(pool, false); + refsz = PAGE_SIZE << pool->p.order; + if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + inflight * refsz)) + goto err_cancel; + if (pool->user.detach_time && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, + pool->user.detach_time)) + goto err_cancel; + + genlmsg_end(rsp, hdr); + + return 0; +err_cancel: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + struct net *net; + + lockdep_assert_held(&page_pools_lock); + + /* 'invisible' page pools don't matter */ + if (hlist_unhashed(&pool->user.list)) + return; + net = dev_net(pool->slow.netdev); + + if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) + return; + + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + if (page_pool_nl_fill(ntf, pool, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&netdev_nl_family, net, ntf, + 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); +} + +int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + u32 id; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) + return -EINVAL; + + id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); + + return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); +} + +int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); +} + +int page_pool_list(struct page_pool *pool) +{ + static u32 id_alloc_next; + int err; + + mutex_lock(&page_pools_lock); + err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, + &id_alloc_next, GFP_KERNEL); + if (err < 0) + goto err_unlock; + + INIT_HLIST_NODE(&pool->user.list); + if (pool->slow.netdev) { + hlist_add_head(&pool->user.list, + &pool->slow.netdev->page_pools); + pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; + + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); + } + + mutex_unlock(&page_pools_lock); + return 0; + +err_unlock: + mutex_unlock(&page_pools_lock); + return err; +} + +void page_pool_detached(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + pool->user.detach_time = ktime_get_boottime_seconds(); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + mutex_unlock(&page_pools_lock); +} + +void page_pool_unlist(struct page_pool *pool) +{ + mutex_lock(&page_pools_lock); + netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); + xa_erase(&page_pools, pool->user.id); + if (!hlist_unhashed(&pool->user.list)) + hlist_del(&pool->user.list); + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev_wipe(struct net_device *netdev) +{ + struct page_pool *pool; + struct hlist_node *n; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { + hlist_del_init(&pool->user.list); + pool->slow.netdev = NET_PTR_POISON; + } + mutex_unlock(&page_pools_lock); +} + +static void page_pool_unreg_netdev(struct net_device *netdev) +{ + struct page_pool *pool, *last; + struct net_device *lo; + + lo = dev_net(netdev)->loopback_dev; + + mutex_lock(&page_pools_lock); + last = NULL; + hlist_for_each_entry(pool, &netdev->page_pools, user.list) { + pool->slow.netdev = lo; + netdev_nl_page_pool_event(pool, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF); + last = pool; + } + if (last) + hlist_splice_init(&netdev->page_pools, &last->user.list, + &lo->page_pools); + mutex_unlock(&page_pools_lock); +} + +static int +page_pool_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + if (hlist_empty(&netdev->page_pools)) + return NOTIFY_OK; + + if (netdev->ifindex != LOOPBACK_IFINDEX) + page_pool_unreg_netdev(netdev); + else + page_pool_unreg_netdev_wipe(netdev); + return NOTIFY_OK; +} + +static struct notifier_block page_pool_netdevice_nb = { + .notifier_call = page_pool_netdevice_event, +}; + +static int __init page_pool_user_init(void) +{ + return register_netdevice_notifier(&page_pool_netdevice_nb); +} + +subsys_initcall(page_pool_user_init); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 57cea67b75..ea55a758a4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3669,10 +3669,8 @@ static int pktgen_thread_worker(void *arg) if (unlikely(!pkt_dev && t->control == 0)) { if (t->net->pktgen_exiting) break; - wait_event_interruptible_timeout(t->queue, - t->control != 0, - HZ/10); - try_to_freeze(); + wait_event_freezable_timeout(t->queue, + t->control != 0, HZ / 10); continue; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8f9cd6b792..bd50e9fe32 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -342,8 +342,7 @@ int rtnl_unregister(int protocol, int msgtype) return -ENOENT; } - link = rtnl_dereference(tab[msgindex]); - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); @@ -368,18 +367,13 @@ void rtnl_unregister_all(int protocol) BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } - RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { - link = rtnl_dereference(tab[msgindex]); - if (!link) - continue; - - RCU_INIT_POINTER(tab[msgindex], NULL); + link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); @@ -1026,14 +1020,17 @@ static size_t rtnl_xdp_size(void) static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; - size_t size; + unsigned int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) + cnt++; + rcu_read_unlock(); - if (list_empty(&dev->name_node->list)) + if (!cnt) return 0; - size = nla_total_size(0); - list_for_each_entry(name_node, &dev->name_node->list, list) - size += nla_total_size(ALTIFNAMSIZ); - return size; + + return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) @@ -1060,7 +1057,7 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ - size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev)); + size += dpll_netdev_pin_handle_size(dev); return size; } @@ -1795,7 +1792,7 @@ static int rtnl_fill_dpll_pin(struct sk_buff *skb, if (!dpll_pin_nest) return -EMSGSIZE; - ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev)); + ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; @@ -3849,10 +3846,18 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -ENOBUFS; - nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; + /* Synchronize the carrier state so we don't report a state + * that we're not actually going to honour immediately; if + * the driver just did a carrier off->on transition, we can + * only TX if link watch work has run, but without this we'd + * already report carrier on, even if it doesn't work yet. + */ + linkwatch_sync_dev(dev); + err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, @@ -6407,17 +6412,64 @@ static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); } +static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct br_mdb_entry *entry = nla_data(attr); + struct br_mdb_entry zero_entry = {}; + + if (nla_len(attr) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); + return -EINVAL; + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG(extack, "Unknown entry state"); + return -EINVAL; + } + + if (entry->flags) { + NL_SET_ERR_MSG(extack, "Entry flags cannot be set"); + return -EINVAL; + } + + if (entry->vid >= VLAN_N_VID - 1) { + NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); + return -EINVAL; + } + + if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) { + NL_SET_ERR_MSG(extack, "Entry address cannot be set"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = { + [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, + rtnl_validate_mdb_entry_del_bulk, + sizeof(struct br_mdb_entry)), + [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, +}; + static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; - err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, - MDBA_SET_ENTRY_MAX, mdba_policy, extack); + if (!del_bulk) + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, mdba_policy, + extack); + else + err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, + mdba_del_bulk_policy, extack); if (err) return err; @@ -6438,6 +6490,14 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } + if (del_bulk) { + if (!dev->netdev_ops->ndo_mdb_del_bulk) { + NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion"); + return -EOPNOTSUPP; + } + return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack); + } + if (!dev->netdev_ops->ndo_mdb_del) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; @@ -6683,5 +6743,6 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, + RTNL_FLAG_BULK_DEL_SUPPORTED); } diff --git a/net/core/scm.c b/net/core/scm.c index 737917c7ac..d0e0852a24 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -325,7 +325,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 78cb3304fb..71dee435d5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -337,7 +337,7 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; - kasan_unpoison_object_data(skbuff_cache, skb); + kasan_mempool_unpoison_object(skb, kmem_cache_size(skbuff_cache)); return skb; } @@ -890,6 +890,11 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static bool is_pp_page(struct page *page) +{ + return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; +} + #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(struct page *page, bool napi_safe) { @@ -905,7 +910,7 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) * and page_is_pfmemalloc() is checked in __page_pool_put_page() * to avoid recycling the pfmemalloc page. */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + if (unlikely(!is_pp_page(page))) return false; pp = page->pp; @@ -942,6 +947,37 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) return napi_pp_put_page(virt_to_page(data), napi_safe); } +/** + * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb + * @skb: page pool aware skb + * + * Increase the fragment reference count (pp_ref_count) of a skb. This is + * intended to gain fragment references only for page pool aware skbs, + * i.e. when skb->pp_recycle is true, and not for fragments in a + * non-pp-recycling skb. It has a fallback to increase references on normal + * pages, as page pool aware skbs may also have normal page fragments. + */ +static int skb_pp_frag_ref(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + struct page *head_page; + int i; + + if (!skb->pp_recycle) + return -EINVAL; + + shinfo = skb_shinfo(skb); + + for (i = 0; i < shinfo->nr_frags; i++) { + head_page = compound_head(skb_frag_page(&shinfo->frags[i])); + if (likely(is_pp_page(head_page))) + page_pool_ref_page(head_page); + else + page_ref_inc(head_page); + } + return 0; +} + static void skb_kfree_head(void *head, unsigned int end_offset) { if (end_offset == SKB_SMALL_HEAD_HEADROOM) @@ -1309,13 +1345,15 @@ static void napi_skb_cache_put(struct sk_buff *skb) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; - kasan_poison_object_data(skbuff_cache, skb); + if (!kasan_mempool_poison_object(skb)) + return; + nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) - kasan_unpoison_object_data(skbuff_cache, - nc->skb_cache[i]); + kasan_mempool_unpoison_object(nc->skb_cache[i], + kmem_cache_size(skbuff_cache)); kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); @@ -5767,17 +5805,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return false; /* In general, avoid mixing page_pool and non-page_pool allocated - * pages within the same SKB. Additionally avoid dealing with clones - * with page_pool pages, in case the SKB is using page_pool fragment - * references (page_pool_alloc_frag()). Since we only take full page - * references for cloned SKBs at the moment that would result in - * inconsistent reference counts. - * In theory we could take full references if @from is cloned and - * !@to->pp_recycle but its tricky (due to potential race with - * the clone disappearing) and rare, so not worth dealing with. + * pages within the same SKB. In theory we could take full + * references if @from is cloned and !@to->pp_recycle but its + * tricky (due to potential race with the clone disappearing) and + * rare, so not worth dealing with. */ - if (to->pp_recycle != from->pp_recycle || - (from->pp_recycle && skb_cloned(from))) + if (to->pp_recycle != from->pp_recycle) return false; if (len <= skb_tailroom(to)) { @@ -5834,8 +5867,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < from_shinfo->nr_frags; i++) - __skb_frag_ref(&from_shinfo->frags[i]); + if (skb_pp_frag_ref(from)) { + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); + } to->truesize += delta; to->len += len; @@ -5962,6 +5997,31 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) } EXPORT_SYMBOL(skb_ensure_writable); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) +{ + int needed_headroom = dev->needed_headroom; + int needed_tailroom = dev->needed_tailroom; + + /* For tail taggers, we need to pad short frames ourselves, to ensure + * that the tail tag does not fail at its role of being at the end of + * the packet, once the conduit interface pads the frame. Account for + * that pad length here, and pad later. + */ + if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) + needed_tailroom += ETH_ZLEN - skb->len; + /* skb_headroom() returns unsigned int... */ + needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); + needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); + + if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) + /* No reallocation needed, yay! */ + return 0; + + return pskb_expand_head(skb, needed_headroom, needed_tailroom, + GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable_head_tail); + /* remove VLAN header from packet and update csum accordingly. * expects a non skb_vlan_tag_present skb with a vlan tag payload */ diff --git a/net/core/sock.c b/net/core/sock.c index 20160865ed..9cf404e803 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -283,10 +283,7 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; - -/* Maximal space eaten by iovec or ancillary data plus some space */ -int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); -EXPORT_SYMBOL(sysctl_optmem_max); +int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; int sysctl_tstamp_allow_data __read_mostly = 1; @@ -2658,7 +2655,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - READ_ONCE(sysctl_optmem_max)) + READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2676,7 +2673,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - int optmem_max = READ_ONCE(sysctl_optmem_max); + int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if ((unsigned int)size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 27d733c0f6..8598466a38 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -411,6 +411,9 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock *sk; int err = 0; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) @@ -933,6 +936,9 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_elem *elem; int ret = -ENOENT; + if (irqs_disabled()) + return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ + hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 03f1edb948..986f15e5d6 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -30,6 +30,7 @@ static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -408,6 +409,14 @@ static struct ctl_table net_core_table[] = { .extra1 = &min_rcvbuf, }, { + .procname = "mem_pcpu_rsv", + .data = &sysctl_mem_pcpu_rsv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_mem_pcpu_rsv, + }, + { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), @@ -509,13 +518,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, .maxlen = sizeof(int), @@ -674,6 +676,14 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dointvec_minmax }, { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), diff --git a/net/core/xdp.c b/net/core/xdp.c index b6f1d6dab3..4869c1c2d8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -736,6 +736,39 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, return -EOPNOTSUPP; } +/** + * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag + * @ctx: XDP context pointer. + * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID). + * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP) + * + * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*, + * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use + * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)** + * and should be used as follows: + * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();`` + * + * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag. + * Driver is expected to provide those in **host byte order (usually LE)**, + * so the bpf program should not perform byte conversion. + * According to 802.1Q standard, *VLAN TCI (Tag control information)* + * is a bit field that contains: + * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``, + * *Drop eligible indicator (DEI)* - 1 bit, + * *Priority code point (PCP)* - 3 bits. + * For detailed meaning of DEI and PCP, please refer to other sources. + * + * Return: + * * Returns 0 on success or ``-errno`` on error. + * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc + * * ``-ENODATA`` : VLAN tag was not stripped or is not available + */ +__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, + __be16 *vlan_proto, u16 *vlan_tci) +{ + return -EOPNOTSUPP; +} + __bpf_kfunc_end_defs(); BTF_SET8_START(xdp_metadata_kfunc_ids) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4550b68066..ded07e09f8 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -669,7 +669,7 @@ discard: ipv6_pktoptions: if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - np->mcast_oif = inet6_iif(opt_skb); + WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) @@ -889,7 +889,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; - SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + net_dbg_ratelimited("connect: ipv4 mapped\n"); if (ipv6_only_sock(sk)) return -ENETUNREACH; diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 4fc7adb326..19dbf54074 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -4,6 +4,7 @@ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ +#include <linux/device.h> #include <net/genetlink.h> #include <net/sock.h> #include "devl_internal.h" @@ -201,7 +202,10 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) int err; WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); + WARN_ON(!devl_is_registered(devlink)); + + if (!devlink_nl_notify_need(devlink)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -213,8 +217,7 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) @@ -424,6 +427,18 @@ static void devlink_reload_netns_change(struct devlink *devlink, devlink_rel_nested_in_notify(devlink); } +static void devlink_reload_reinit_sanity_check(struct devlink *devlink) +{ + WARN_ON(!list_empty(&devlink->trap_policer_list)); + WARN_ON(!list_empty(&devlink->trap_group_list)); + WARN_ON(!list_empty(&devlink->trap_list)); + WARN_ON(!list_empty(&devlink->dpipe_table_list)); + WARN_ON(!list_empty(&devlink->sb_list)); + WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(!list_empty(&devlink->linecard_list)); + WARN_ON(!xa_empty(&devlink->ports)); +} + int devlink_reload(struct devlink *devlink, struct net *dest_net, enum devlink_reload_action action, enum devlink_reload_limit limit, @@ -433,6 +448,13 @@ int devlink_reload(struct devlink *devlink, struct net *dest_net, struct net *curr_net; int err; + /* Make sure the reload operations are invoked with the device lock + * held to allow drivers to trigger functionality that expects it + * (e.g., PCI reset) and to close possible races between these + * operations and probe/remove. + */ + device_lock_assert(devlink->dev); + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); @@ -444,8 +466,10 @@ int devlink_reload(struct devlink *devlink, struct net *dest_net, if (dest_net && !net_eq(dest_net, curr_net)) devlink_reload_netns_change(devlink, curr_net, dest_net); - if (action == DEVLINK_RELOAD_ACTION_DRIVER_REINIT) + if (action == DEVLINK_RELOAD_ACTION_DRIVER_REINIT) { devlink_params_driverinit_load_new(devlink); + devlink_reload_reinit_sanity_check(devlink); + } err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); @@ -977,7 +1001,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, cmd != DEVLINK_CMD_FLASH_UPDATE_END && cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -988,8 +1012,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, if (err) goto out_free_msg; - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); return; out_free_msg: diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 5ea2e2012e..c7a8e13f91 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -91,10 +91,15 @@ extern struct genl_family devlink_nl_family; struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp); +static inline bool __devl_is_registered(struct devlink *devlink) +{ + return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); +} + static inline bool devl_is_registered(struct devlink *devlink) { devl_assert_locked(devlink); - return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); + return __devl_is_registered(devlink); } static inline void devl_dev_lock(struct devlink *devlink, bool dev_lock) @@ -180,6 +185,58 @@ int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net, struct devlink *devlink, int attrtype); int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info); +static inline bool devlink_nl_notify_need(struct devlink *devlink) +{ + return genl_has_listeners(&devlink_nl_family, devlink_net(devlink), + DEVLINK_MCGRP_CONFIG); +} + +struct devlink_obj_desc { + struct rcu_head rcu; + const char *bus_name; + const char *dev_name; + unsigned int port_index; + bool port_index_valid; + long data[]; +}; + +static inline void devlink_nl_obj_desc_init(struct devlink_obj_desc *desc, + struct devlink *devlink) +{ + memset(desc, 0, sizeof(*desc)); + desc->bus_name = devlink->dev->bus->name; + desc->dev_name = dev_name(devlink->dev); +} + +static inline void devlink_nl_obj_desc_port_set(struct devlink_obj_desc *desc, + struct devlink_port *devlink_port) +{ + desc->port_index = devlink_port->index; + desc->port_index_valid = true; +} + +int devlink_nl_notify_filter(struct sock *dsk, struct sk_buff *skb, void *data); + +static inline void devlink_nl_notify_send_desc(struct devlink *devlink, + struct sk_buff *msg, + struct devlink_obj_desc *desc) +{ + genlmsg_multicast_netns_filtered(&devlink_nl_family, + devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, + GFP_KERNEL, + devlink_nl_notify_filter, desc); +} + +static inline void devlink_nl_notify_send(struct devlink *devlink, + struct sk_buff *msg) +{ + struct devlink_obj_desc desc; + + devlink_nl_obj_desc_init(&desc, devlink); + devlink_nl_notify_send_desc(devlink, msg, &desc); +} + /* Notify */ void devlink_notify_register(struct devlink *devlink); void devlink_notify_unregister(struct devlink *devlink); diff --git a/net/devlink/health.c b/net/devlink/health.c index 71ae121dc7..acb8c0e174 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -490,12 +490,16 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, enum devlink_command cmd) { struct devlink *devlink = reporter->devlink; + struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); ASSERT_DEVLINK_REGISTERED(devlink); + if (!devlink_nl_notify_need(devlink)) + return; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; @@ -506,8 +510,10 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_obj_desc_init(&desc, devlink); + if (reporter->devlink_port) + devlink_nl_obj_desc_port_set(&desc, reporter->devlink_port); + devlink_nl_notify_send_desc(devlink, msg, &desc); } void diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c index 2f1c317b64..67f70a621d 100644 --- a/net/devlink/linecard.c +++ b/net/devlink/linecard.c @@ -136,7 +136,7 @@ static void devlink_linecard_notify(struct devlink_linecard *linecard, WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && cmd != DEVLINK_CMD_LINECARD_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -150,8 +150,7 @@ static void devlink_linecard_notify(struct devlink_linecard *linecard, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } void devlink_linecards_notify_register(struct devlink *devlink) diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 0f41fded6a..593605c1b1 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -17,6 +17,119 @@ static const struct genl_multicast_group devlink_nl_mcgrps[] = { [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, }; +struct devlink_nl_sock_priv { + struct devlink_obj_desc __rcu *flt; + spinlock_t flt_lock; /* Protects flt. */ +}; + +static void devlink_nl_sock_priv_init(void *priv) +{ + struct devlink_nl_sock_priv *sk_priv = priv; + + spin_lock_init(&sk_priv->flt_lock); +} + +static void devlink_nl_sock_priv_destroy(void *priv) +{ + struct devlink_nl_sock_priv *sk_priv = priv; + struct devlink_obj_desc *flt; + + flt = rcu_dereference_protected(sk_priv->flt, true); + kfree_rcu(flt, rcu); +} + +int devlink_nl_notify_filter_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_nl_sock_priv *sk_priv; + struct nlattr **attrs = info->attrs; + struct devlink_obj_desc *flt; + size_t data_offset = 0; + size_t data_size = 0; + char *pos; + + if (attrs[DEVLINK_ATTR_BUS_NAME]) + data_size = size_add(data_size, + nla_len(attrs[DEVLINK_ATTR_BUS_NAME]) + 1); + if (attrs[DEVLINK_ATTR_DEV_NAME]) + data_size = size_add(data_size, + nla_len(attrs[DEVLINK_ATTR_DEV_NAME]) + 1); + + flt = kzalloc(size_add(sizeof(*flt), data_size), GFP_KERNEL); + if (!flt) + return -ENOMEM; + + pos = (char *) flt->data; + if (attrs[DEVLINK_ATTR_BUS_NAME]) { + data_offset += nla_strscpy(pos, + attrs[DEVLINK_ATTR_BUS_NAME], + data_size) + 1; + flt->bus_name = pos; + pos += data_offset; + } + if (attrs[DEVLINK_ATTR_DEV_NAME]) { + nla_strscpy(pos, attrs[DEVLINK_ATTR_DEV_NAME], + data_size - data_offset); + flt->dev_name = pos; + } + + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + flt->port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + flt->port_index_valid = true; + } + + /* Don't attach empty filter. */ + if (!flt->bus_name && !flt->dev_name && !flt->port_index_valid) { + kfree(flt); + flt = NULL; + } + + sk_priv = genl_sk_priv_get(&devlink_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(sk_priv)) { + kfree(flt); + return PTR_ERR(sk_priv); + } + spin_lock(&sk_priv->flt_lock); + flt = rcu_replace_pointer(sk_priv->flt, flt, + lockdep_is_held(&sk_priv->flt_lock)); + spin_unlock(&sk_priv->flt_lock); + kfree_rcu(flt, rcu); + return 0; +} + +static bool devlink_obj_desc_match(const struct devlink_obj_desc *desc, + const struct devlink_obj_desc *flt) +{ + if (desc->bus_name && flt->bus_name && + strcmp(desc->bus_name, flt->bus_name)) + return false; + if (desc->dev_name && flt->dev_name && + strcmp(desc->dev_name, flt->dev_name)) + return false; + if (desc->port_index_valid && flt->port_index_valid && + desc->port_index != flt->port_index) + return false; + return true; +} + +int devlink_nl_notify_filter(struct sock *dsk, struct sk_buff *skb, void *data) +{ + struct devlink_obj_desc *desc = data; + struct devlink_nl_sock_priv *sk_priv; + struct devlink_obj_desc *flt; + int ret = 0; + + rcu_read_lock(); + sk_priv = __genl_sk_priv_get(&devlink_nl_family, dsk); + if (!IS_ERR_OR_NULL(sk_priv)) { + flt = rcu_dereference(sk_priv->flt); + if (flt) + ret = !devlink_obj_desc_match(desc, flt); + } + rcu_read_unlock(); + return ret; +} + int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net, struct devlink *devlink, int attrtype) { @@ -139,6 +252,12 @@ int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT); } +int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK); +} + int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -163,6 +282,13 @@ void devlink_nl_post_doit(const struct genl_split_ops *ops, __devlink_nl_post_doit(skb, info, 0); } +void +devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + __devlink_nl_post_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK); +} + static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, struct netlink_callback *cb, int flags, devlink_nl_dump_one_func_t *dump_one, @@ -244,4 +370,7 @@ struct genl_family devlink_nl_family __ro_after_init = { .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), + .sock_priv_size = sizeof(struct devlink_nl_sock_priv), + .sock_priv_init = devlink_nl_sock_priv_init, + .sock_priv_destroy = devlink_nl_sock_priv_destroy, }; diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 371f27f653..f9786d51f6 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -560,8 +560,15 @@ static const struct nla_policy devlink_selftests_run_nl_policy[DEVLINK_ATTR_SELF [DEVLINK_ATTR_SELFTESTS] = NLA_POLICY_NESTED(devlink_dl_selftest_id_nl_policy), }; +/* DEVLINK_CMD_NOTIFY_FILTER_SET - do */ +static const struct nla_policy devlink_notify_filter_set_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, +}; + /* Ops table for devlink */ -const struct genl_split_ops devlink_nl_ops[73] = { +const struct genl_split_ops devlink_nl_ops[74] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT, @@ -846,9 +853,9 @@ const struct genl_split_ops devlink_nl_ops[73] = { { .cmd = DEVLINK_CMD_RELOAD, .validate = GENL_DONT_VALIDATE_STRICT, - .pre_doit = devlink_nl_pre_doit, + .pre_doit = devlink_nl_pre_doit_dev_lock, .doit = devlink_nl_reload_doit, - .post_doit = devlink_nl_post_doit, + .post_doit = devlink_nl_post_doit_dev_lock, .policy = devlink_reload_nl_policy, .maxattr = DEVLINK_ATTR_RELOAD_LIMITS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, @@ -1233,4 +1240,11 @@ const struct genl_split_ops devlink_nl_ops[73] = { .maxattr = DEVLINK_ATTR_SELFTESTS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = DEVLINK_CMD_NOTIFY_FILTER_SET, + .doit = devlink_nl_notify_filter_set_doit, + .policy = devlink_notify_filter_set_nl_policy, + .maxattr = DEVLINK_ATTR_PORT_INDEX, + .flags = GENL_CMD_CAP_DO, + }, }; diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 0e9e89c31c..8f2bd50ddf 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -16,18 +16,23 @@ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_F extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ -extern const struct genl_split_ops devlink_nl_ops[73]; +extern const struct genl_split_ops devlink_nl_ops[74]; int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +void +devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); @@ -137,5 +142,7 @@ int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_selftests_run_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_notify_filter_set_doit(struct sk_buff *skb, + struct genl_info *info); #endif /* _LINUX_DEVLINK_GEN_H */ diff --git a/net/devlink/param.c b/net/devlink/param.c index d74df09311..22bc3b5005 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -343,7 +343,7 @@ static void devlink_param_notify(struct devlink *devlink, * will replay the notifications if the params are added/removed * outside of the lifetime of the instance. */ - if (!devl_is_registered(devlink)) + if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -356,8 +356,7 @@ static void devlink_param_notify(struct devlink *devlink, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } static void devlink_params_notify(struct devlink *devlink, diff --git a/net/devlink/port.c b/net/devlink/port.c index 2b3c2b1a3e..118d130d2a 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -507,12 +507,13 @@ static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; + struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -525,8 +526,9 @@ static void devlink_port_notify(struct devlink_port *devlink_port, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_obj_desc_init(&desc, devlink); + devlink_nl_obj_desc_port_set(&desc, devlink_port); + devlink_nl_notify_send_desc(devlink, msg, &desc); } static void devlink_ports_notify(struct devlink *devlink, diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 94b289b93f..7139e67e93 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -146,7 +146,7 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -159,8 +159,7 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } void devlink_rates_notify_register(struct devlink *devlink) diff --git a/net/devlink/region.c b/net/devlink/region.c index e3bab458db..7319127c59 100644 --- a/net/devlink/region.c +++ b/net/devlink/region.c @@ -234,15 +234,15 @@ static void devlink_nl_region_notify(struct devlink_region *region, struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + + if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); if (IS_ERR(msg)) return; - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } void devlink_regions_notify_register(struct devlink *devlink) diff --git a/net/devlink/trap.c b/net/devlink/trap.c index c26313e7ca..5d18c7424d 100644 --- a/net/devlink/trap.c +++ b/net/devlink/trap.c @@ -1173,7 +1173,8 @@ devlink_trap_group_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && cmd != DEVLINK_CMD_TRAP_GROUP_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + + if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -1187,8 +1188,7 @@ devlink_trap_group_notify(struct devlink *devlink, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } void devlink_trap_groups_notify_register(struct devlink *devlink) @@ -1234,7 +1234,8 @@ static void devlink_trap_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && cmd != DEVLINK_CMD_TRAP_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + + if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -1247,8 +1248,7 @@ static void devlink_trap_notify(struct devlink *devlink, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } void devlink_traps_notify_register(struct devlink *devlink) @@ -1710,7 +1710,8 @@ devlink_trap_policer_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && cmd != DEVLINK_CMD_TRAP_POLICER_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + + if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -1724,8 +1725,7 @@ devlink_trap_policer_notify(struct devlink *devlink, return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + devlink_nl_notify_send(devlink, msg); } void devlink_trap_policers_notify_register(struct devlink *devlink) diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 155b061634..7c2dba273e 100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -23,6 +23,6 @@ config DNS_RESOLVER information. To compile this as a module, choose M here: the module will be called - dnsresolver. + dns_resolver. If unsure, say N. diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 92ce67b93a..cbb588ca73 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -89,6 +89,7 @@ static const struct dsa_device_ops ar9331_netdev_ops = { .needed_headroom = AR9331_HDR_LEN, }; +MODULE_DESCRIPTION("DSA tag driver for Atheros AR9331 SoC with built-in switch"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331, AR9331_NAME); module_dsa_tag_driver(ar9331_netdev_ops); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 83d283a5d2..8c3c068728 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -335,4 +335,5 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = { module_dsa_tag_drivers(dsa_tag_driver_array); +MODULE_DESCRIPTION("DSA tag driver for Broadcom switches using in-frame headers"); MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 8ed52dd663..2a2c4fb61a 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -406,4 +406,5 @@ static struct dsa_tag_driver *dsa_tag_drivers[] = { module_dsa_tag_drivers(dsa_tag_drivers); +MODULE_DESCRIPTION("DSA tag driver for Marvell switches using DSA headers"); MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 3539141b53..51a1f46a56 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -107,6 +107,7 @@ static const struct dsa_device_ops gswip_netdev_ops = { .needed_headroom = GSWIP_RX_HEADER_LEN, }; +MODULE_DESCRIPTION("DSA tag driver for Lantiq / Intel GSWIP switches"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP, GSWIP_NAME); diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 6e233cd0aa..663b25785d 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -67,6 +67,7 @@ static const struct dsa_device_ops hellcreek_netdev_ops = { .needed_tailroom = HELLCREEK_TAG_LEN, }; +MODULE_DESCRIPTION("DSA tag driver for Hirschmann Hellcreek TSN switches"); MODULE_LICENSE("Dual MIT/GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK, HELLCREEK_NAME); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 9be341fa88..ee7b272ab7 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -459,4 +459,5 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = { module_dsa_tag_drivers(dsa_tag_driver_array); +MODULE_DESCRIPTION("DSA tag driver for Microchip 8795/937x/9477/9893 families of switches"); MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 1ed8ee2485..258e5d7dc5 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -119,6 +119,7 @@ static const struct dsa_device_ops lan9303_netdev_ops = { .needed_headroom = LAN9303_TAG_LEN, }; +MODULE_DESCRIPTION("DSA tag driver for SMSC/Microchip LAN9303 family of switches"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303, LAN9303_NAME); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 2483785f6a..b670e3c53e 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -102,6 +102,7 @@ static const struct dsa_device_ops mtk_netdev_ops = { .needed_headroom = MTK_HDR_LEN, }; +MODULE_DESCRIPTION("DSA tag driver for Mediatek switches"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK, MTK_NAME); diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c index 9a473624db..e9c9670a9c 100644 --- a/net/dsa/tag_none.c +++ b/net/dsa/tag_none.c @@ -27,4 +27,5 @@ static const struct dsa_device_ops none_ops = { module_dsa_tag_driver(none_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NONE, NONE_NAME); +MODULE_DESCRIPTION("DSA no-op tag driver"); MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index ef2f8fffb2..e0e4300bfb 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -217,4 +217,5 @@ static struct dsa_tag_driver *ocelot_tag_driver_array[] = { module_dsa_tag_drivers(ocelot_tag_driver_array); +MODULE_DESCRIPTION("DSA tag driver for Ocelot family of switches, using NPI port"); MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 2100393208..b059381310 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -133,6 +133,7 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = { .promisc_on_conduit = true, }; +MODULE_DESCRIPTION("DSA tag driver for Ocelot family of switches, using VLAN"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q, OCELOT_8021Q_NAME); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 6514aa7993..0cf61286b4 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -119,6 +119,7 @@ static const struct dsa_device_ops qca_netdev_ops = { .promisc_on_conduit = true, }; +MODULE_DESCRIPTION("DSA tag driver for Qualcomm Atheros QCA8K switches"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA, QCA_NAME); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 4da5bad1a7..feaefa0e17 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -23,7 +23,6 @@ #define RTL4_A_NAME "rtl4a" #define RTL4_A_HDR_LEN 4 -#define RTL4_A_ETHERTYPE 0x8899 #define RTL4_A_PROTOCOL_SHIFT 12 /* * 0x1 = Realtek Remote Control protocol (RRCP) @@ -54,7 +53,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, /* Set Ethertype */ p = (__be16 *)tag; - *p = htons(RTL4_A_ETHERTYPE); + *p = htons(ETH_P_REALTEK); out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT); /* The lower bits indicate the port number */ @@ -82,7 +81,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, tag = dsa_etype_header_pos_rx(skb); p = (__be16 *)tag; etype = ntohs(*p); - if (etype != RTL4_A_ETHERTYPE) { + if (etype != ETH_P_REALTEK) { /* Not custom, just pass through */ netdev_dbg(dev, "non-realtek ethertype 0x%04x\n", etype); return skb; @@ -122,5 +121,6 @@ static const struct dsa_device_ops rtl4a_netdev_ops = { }; module_dsa_tag_driver(rtl4a_netdev_ops); +MODULE_DESCRIPTION("DSA tag driver for Realtek 4 byte protocol A tags"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A, RTL4_A_NAME); diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c index 07e857deba..15c2bae2b4 100644 --- a/net/dsa/tag_rtl8_4.c +++ b/net/dsa/tag_rtl8_4.c @@ -258,4 +258,5 @@ static struct dsa_tag_driver *dsa_tag_drivers[] = { }; module_dsa_tag_drivers(dsa_tag_drivers); +MODULE_DESCRIPTION("DSA tag driver for Realtek 8 byte protocol 4 tags"); MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c index 2ce866b456..69d51221b1 100644 --- a/net/dsa/tag_rzn1_a5psw.c +++ b/net/dsa/tag_rzn1_a5psw.c @@ -110,6 +110,7 @@ static const struct dsa_device_ops a5psw_netdev_ops = { .needed_headroom = A5PSW_TAG_LEN, }; +MODULE_DESCRIPTION("DSA tag driver for Renesas RZ/N1 A5PSW switch"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW, A5PSW_NAME); module_dsa_tag_driver(a5psw_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 1fffe8c2b5..2717e9d7b6 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -806,4 +806,5 @@ static struct dsa_tag_driver *sja1105_tag_driver_array[] = { module_dsa_tag_drivers(sja1105_tag_driver_array); +MODULE_DESCRIPTION("DSA tag driver for NXP SJA1105 switches"); MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 1ebb25a8b1..22742a53d6 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -59,6 +59,7 @@ static const struct dsa_device_ops trailer_netdev_ops = { .needed_tailroom = 4, }; +MODULE_DESCRIPTION("DSA tag driver for switches using a trailer tag"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER, TRAILER_NAME); diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index c9c163598e..68d4633ddd 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -60,6 +60,7 @@ static const struct dsa_device_ops xrs700x_netdev_ops = { .needed_tailroom = 1, }; +MODULE_DESCRIPTION("DSA tag driver for XRS700x switches"); MODULE_LICENSE("GPL"); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X, XRS700X_NAME); diff --git a/net/dsa/user.c b/net/dsa/user.c index a82c7f5a1a..b15e71cc34 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -920,30 +920,6 @@ netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); -static int dsa_realloc_skb(struct sk_buff *skb, struct net_device *dev) -{ - int needed_headroom = dev->needed_headroom; - int needed_tailroom = dev->needed_tailroom; - - /* For tail taggers, we need to pad short frames ourselves, to ensure - * that the tail tag does not fail at its role of being at the end of - * the packet, once the conduit interface pads the frame. Account for - * that pad length here, and pad later. - */ - if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) - needed_tailroom += ETH_ZLEN - skb->len; - /* skb_headroom() returns unsigned int... */ - needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); - needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); - - if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) - /* No reallocation needed, yay! */ - return 0; - - return pskb_expand_head(skb, needed_headroom, needed_tailroom, - GFP_ATOMIC); -} - static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); @@ -956,13 +932,14 @@ static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); - if (dsa_realloc_skb(skb, dev)) { + if (skb_ensure_writable_head_tail(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from - * dsa_realloc_skb(), which has also ensured that padding is safe. + * skb_ensure_writable_head_tail(), which has also ensured that + * padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 2edc8b796a..049c3adeb8 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -164,17 +164,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); - if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) { - if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { - if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) - skb->pkt_type = PACKET_BROADCAST; - else - skb->pkt_type = PACKET_MULTICAST; - } else { - skb->pkt_type = PACKET_OTHERHOST; - } - } + eth_skb_pkt_type(skb, dev); /* * Some variants of DSA tagging don't have an ethertype field diff --git a/net/ethtool/common.c b/net/ethtool/common.c index b4419fb6df..6b2a360dcd 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -589,8 +589,8 @@ err_free_info: int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) { + struct ethtool_rxfh_param rxfh = {}; u32 dev_size, current_max = 0; - u32 *indir; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || @@ -600,21 +600,21 @@ int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) if (dev_size == 0) return -EOPNOTSUPP; - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) + rxfh.indir = kcalloc(dev_size, sizeof(rxfh.indir[0]), GFP_USER); + if (!rxfh.indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); if (ret) goto out; while (dev_size--) - current_max = max(current_max, indir[dev_size]); + current_max = max(current_max, rxfh.indir[dev_size]); *max = current_max; out: - kfree(indir); + kfree(rxfh.indir); return ret; } @@ -661,6 +661,12 @@ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) } EXPORT_SYMBOL(ethtool_get_phc_vclocks); +int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info) +{ + return __ethtool_get_ts_info(dev, info); +} +EXPORT_SYMBOL(ethtool_get_ts_info_by_layer); + const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 0b0ce4f81c..7519b0818b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -58,6 +58,9 @@ static struct devlink *netdev_to_devlink_get(struct net_device *dev) u32 ethtool_op_get_link(struct net_device *dev) { + /* Synchronize carrier state with link watch, see also rtnl_getlink() */ + linkwatch_sync_dev(dev); + return netif_carrier_ok(dev) ? 1 : 0; } EXPORT_SYMBOL(ethtool_op_get_link); @@ -969,18 +972,38 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr, static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { + const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size = sizeof(info); int rc; - if (!dev->ethtool_ops->set_rxnfc) + if (!ops->set_rxnfc) return -EOPNOTSUPP; rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (rc) return rc; - rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (ops->get_rxfh) { + struct ethtool_rxfh_param rxfh = {}; + + rc = ops->get_rxfh(dev, &rxfh); + if (rc) + return rc; + + /* Sanity check: if symmetric-xor is set, then: + * 1 - no other fields besides IP src/dst and/or L4 src/dst + * 2 - If src is set, dst must also be set + */ + if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && + ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) || + (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || + (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) + return -EINVAL; + } + + rc = ops->set_rxnfc(dev, &info); if (rc) return rc; @@ -1058,15 +1081,15 @@ EXPORT_SYMBOL(netdev_rss_key_fill); static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { - u32 user_size, dev_size; - u32 *indir; + struct ethtool_rxfh_param rxfh = {}; + u32 user_size; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; - dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); - if (dev_size == 0) + rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (rxfh.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, @@ -1075,41 +1098,41 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, return -EFAULT; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), - &dev_size, sizeof(dev_size))) + &rxfh.indir_size, sizeof(rxfh.indir_size))) return -EFAULT; /* If the user buffer size is 0, this is just a query for the * device table size. Otherwise, if it's smaller than the * device table size it's an error. */ - if (user_size < dev_size) + if (user_size < rxfh.indir_size) return user_size == 0 ? 0 : -EINVAL; - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) + rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER); + if (!rxfh.indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); if (ret) goto out; - if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, ring_index[0]), - indir, dev_size * sizeof(indir[0]))) + rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir))) ret = -EFAULT; out: - kfree(indir); + kfree(rxfh.indir); return ret; } static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, void __user *useraddr) { - struct ethtool_rxnfc rx_rings; - u32 user_size, dev_size, i; - u32 *indir; const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxfh_param rxfh_dev = {}; + struct netlink_ext_ack *extack = NULL; + struct ethtool_rxnfc rx_rings; + u32 user_size, i; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); @@ -1117,8 +1140,8 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, !ops->get_rxnfc) return -EOPNOTSUPP; - dev_size = ops->get_rxfh_indir_size(dev); - if (dev_size == 0) + rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); + if (rxfh_dev.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, @@ -1126,11 +1149,12 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, sizeof(user_size))) return -EFAULT; - if (user_size != 0 && user_size != dev_size) + if (user_size != 0 && user_size != rxfh_dev.indir_size) return -EINVAL; - indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); - if (!indir) + rxfh_dev.indir = kcalloc(rxfh_dev.indir_size, + sizeof(rxfh_dev.indir[0]), GFP_USER); + if (!rxfh_dev.indir) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; @@ -1139,18 +1163,21 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; if (user_size == 0) { - for (i = 0; i < dev_size; i++) + u32 *indir = rxfh_dev.indir; + + for (i = 0; i < rxfh_dev.indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { - ret = ethtool_copy_validate_indir(indir, + ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, &rx_rings, - dev_size); + rxfh_dev.indir_size); if (ret) goto out; } - ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); + rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE; + ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; @@ -1161,32 +1188,29 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, dev->priv_flags |= IFF_RXFH_CONFIGURED; out: - kfree(indir); + kfree(rxfh_dev.indir); return ret; } static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, void __user *useraddr) { - int ret; const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxfh_param rxfh_dev = {}; u32 user_indir_size, user_key_size; - u32 dev_indir_size = 0, dev_key_size = 0; struct ethtool_rxfh rxfh; - u32 total_size; u32 indir_bytes; - u32 *indir = NULL; - u8 dev_hfunc = 0; - u8 *hkey = NULL; u8 *rss_config; + u32 total_size; + int ret; if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) - dev_indir_size = ops->get_rxfh_indir_size(dev); + rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) - dev_key_size = ops->get_rxfh_key_size(dev); + rxfh_dev.key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; @@ -1194,44 +1218,46 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !ops->get_rxfh_context) + if (rxfh.rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; - rxfh.indir_size = dev_indir_size; - rxfh.key_size = dev_key_size; + rxfh.indir_size = rxfh_dev.indir_size; + rxfh.key_size = rxfh_dev.key_size; if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; - if ((user_indir_size && (user_indir_size != dev_indir_size)) || - (user_key_size && (user_key_size != dev_key_size))) + if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) || + (user_key_size && user_key_size != rxfh_dev.key_size)) return -EINVAL; - indir_bytes = user_indir_size * sizeof(indir[0]); + indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]); total_size = indir_bytes + user_key_size; rss_config = kzalloc(total_size, GFP_USER); if (!rss_config) return -ENOMEM; if (user_indir_size) - indir = (u32 *)rss_config; + rxfh_dev.indir = (u32 *)rss_config; if (user_key_size) - hkey = rss_config + indir_bytes; + rxfh_dev.key = rss_config + indir_bytes; - if (rxfh.rss_context) - ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, - &dev_hfunc, - rxfh.rss_context); - else - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + rxfh_dev.rss_context = rxfh.rss_context; + + ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), - &dev_hfunc, sizeof(rxfh.hfunc))) { + &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, input_xfrm), + &rxfh_dev.input_xfrm, + sizeof(rxfh.input_xfrm))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_config[0]), @@ -1247,16 +1273,16 @@ out: static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, void __user *useraddr) { - int ret; + u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; + u32 dev_indir_size = 0, dev_key_size = 0, i; + struct ethtool_rxfh_param rxfh_dev = {}; + struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; - u32 dev_indir_size = 0, dev_key_size = 0, i; - u32 *indir = NULL, indir_bytes = 0; - u8 *hkey = NULL; + u32 indir_bytes = 0; u8 *rss_config; - u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - bool delete = false; + int ret; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; @@ -1270,25 +1296,34 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !ops->set_rxfh_context) + if (rxfh.rss_context && !ops->cap_rss_ctx_supported) + return -EOPNOTSUPP; + /* Check input data transformation capabilities */ + if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && + rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) + return -EINVAL; + if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && + !ops->cap_rss_sym_xor_supported) return -EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. - * Must request at least one change: indir size, hash key or function. + * Must request at least one change: indir size, hash key, function + * or input transformation. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE && + rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - indir_bytes = dev_indir_size * sizeof(indir[0]); + indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); if (!rss_config) @@ -1305,8 +1340,9 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { - indir = (u32 *)rss_config; - ret = ethtool_copy_validate_indir(indir, + rxfh_dev.indir = (u32 *)rss_config; + rxfh_dev.indir_size = dev_indir_size; + ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, &rx_rings, rxfh.indir_size); @@ -1314,17 +1350,22 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, goto out; } else if (rxfh.indir_size == 0) { if (rxfh.rss_context == 0) { - indir = (u32 *)rss_config; + u32 *indir; + + rxfh_dev.indir = (u32 *)rss_config; + rxfh_dev.indir_size = dev_indir_size; + indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { - delete = true; + rxfh_dev.rss_delete = true; } } if (rxfh.key_size) { - hkey = rss_config + indir_bytes; - if (copy_from_user(hkey, + rxfh_dev.key_size = dev_key_size; + rxfh_dev.key = rss_config + indir_bytes; + if (copy_from_user(rxfh_dev.key, useraddr + rss_cfg_offset + indir_bytes, rxfh.key_size)) { ret = -EFAULT; @@ -1332,19 +1373,19 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - if (rxfh.rss_context) - ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, - &rxfh.rss_context, delete); - else - ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + rxfh_dev.hfunc = rxfh.hfunc; + rxfh_dev.rss_context = rxfh.rss_context; + rxfh_dev.input_xfrm = rxfh.input_xfrm; + + ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), - &rxfh.rss_context, sizeof(rxfh.rss_context))) + &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context))) ret = -EFAULT; - if (!rxfh.rss_context) { + if (!rxfh_dev.rss_context) { /* indicate whether rxfh was set to default */ if (rxfh.indir_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; @@ -1991,6 +2032,13 @@ __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...) } EXPORT_SYMBOL(ethtool_sprintf); +void ethtool_puts(u8 **data, const char *str) +{ + strscpy(*data, str, ETH_GSTRING_LEN); + *data += ETH_GSTRING_LEN; +} +EXPORT_SYMBOL(ethtool_puts); + static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index fb09f774ea..b7865a14fd 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -124,6 +124,8 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = + NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), @@ -145,6 +147,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, return -EOPNOTSUPP; } + if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && + !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], + "setting TCP data split is not supported"); + return -EOPNOTSUPP; + } + if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, @@ -202,6 +212,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); + ethnl_update_u8(&kernel_ringparam.tcp_data_split, + tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 5764202e6c..71679137ef 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -13,6 +13,7 @@ struct rss_reply_data { u32 indir_size; u32 hkey_size; u32 hfunc; + u32 input_xfrm; u32 *indir_table; u8 *hkey; }; @@ -48,9 +49,9 @@ rss_prepare_data(const struct ethnl_req_info *req_base, struct rss_reply_data *data = RSS_REPDATA(reply_base); struct rss_req_info *request = RSS_REQINFO(req_base); struct net_device *dev = reply_base->dev; + struct ethtool_rxfh_param rxfh = {}; const struct ethtool_ops *ops; u32 total_size, indir_bytes; - u8 dev_hfunc = 0; u8 *rss_config; int ret; @@ -59,7 +60,7 @@ rss_prepare_data(const struct ethnl_req_info *req_base, return -EOPNOTSUPP; /* Some drivers don't handle rss_context */ - if (request->rss_context && !ops->get_rxfh_context) + if (request->rss_context && !ops->cap_rss_ctx_supported) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); @@ -83,21 +84,21 @@ rss_prepare_data(const struct ethnl_req_info *req_base, if (data->indir_size) data->indir_table = (u32 *)rss_config; - if (data->hkey_size) data->hkey = rss_config + indir_bytes; - if (request->rss_context) - ret = ops->get_rxfh_context(dev, data->indir_table, data->hkey, - &dev_hfunc, request->rss_context); - else - ret = ops->get_rxfh(dev, data->indir_table, data->hkey, - &dev_hfunc); + rxfh.indir_size = data->indir_size; + rxfh.indir = data->indir_table; + rxfh.key_size = data->hkey_size; + rxfh.key = data->hkey; + rxfh.rss_context = request->rss_context; + ret = ops->get_rxfh(dev, &rxfh); if (ret) goto out_ops; - data->hfunc = dev_hfunc; + data->hfunc = rxfh.hfunc; + data->input_xfrm = rxfh.input_xfrm; out_ops: ethnl_ops_complete(dev); return ret; @@ -111,6 +112,7 @@ rss_reply_size(const struct ethnl_req_info *req_base, int len; len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ + nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ nla_total_size(data->hkey_size); /* _RSS_HKEY */ @@ -125,6 +127,8 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, if ((data->hfunc && nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || + (data->input_xfrm && + nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || (data->indir_size && nla_put(skb, ETHTOOL_A_RSS_INDIR, sizeof(u32) * data->indir_size, data->indir_table)) || diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index dd4b5f0aa1..9d71b66183 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -173,7 +173,24 @@ static int hsr_dev_open(struct net_device *dev) static int hsr_dev_close(struct net_device *dev) { - /* Nothing to do here. */ + struct hsr_port *port; + struct hsr_priv *hsr; + + hsr = netdev_priv(dev); + hsr_for_each_port(hsr, port) { + if (port->type == HSR_PT_MASTER) + continue; + switch (port->type) { + case HSR_PT_SLAVE_A: + case HSR_PT_SLAVE_B: + dev_uc_unsync(port->dev, dev); + dev_mc_unsync(port->dev, dev); + break; + default: + break; + } + } + return 0; } @@ -404,12 +421,60 @@ void hsr_del_ports(struct hsr_priv *hsr) hsr_del_port(port); } +static void hsr_set_rx_mode(struct net_device *dev) +{ + struct hsr_port *port; + struct hsr_priv *hsr; + + hsr = netdev_priv(dev); + + hsr_for_each_port(hsr, port) { + if (port->type == HSR_PT_MASTER) + continue; + switch (port->type) { + case HSR_PT_SLAVE_A: + case HSR_PT_SLAVE_B: + dev_mc_sync_multiple(port->dev, dev); + dev_uc_sync_multiple(port->dev, dev); + break; + default: + break; + } + } +} + +static void hsr_change_rx_flags(struct net_device *dev, int change) +{ + struct hsr_port *port; + struct hsr_priv *hsr; + + hsr = netdev_priv(dev); + + hsr_for_each_port(hsr, port) { + if (port->type == HSR_PT_MASTER) + continue; + switch (port->type) { + case HSR_PT_SLAVE_A: + case HSR_PT_SLAVE_B: + if (change & IFF_ALLMULTI) + dev_set_allmulti(port->dev, + dev->flags & + IFF_ALLMULTI ? 1 : -1); + break; + default: + break; + } + } +} + static const struct net_device_ops hsr_device_ops = { .ndo_change_mtu = hsr_dev_change_mtu, .ndo_open = hsr_dev_open, .ndo_stop = hsr_dev_close, .ndo_start_xmit = hsr_dev_xmit, + .ndo_change_rx_flags = hsr_change_rx_flags, .ndo_fix_features = hsr_fix_features, + .ndo_set_rx_mode = hsr_set_rx_mode, }; static struct device_type hsr_type = { diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 0323ab5023..5d68cb1816 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -435,7 +435,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame) continue; /* Don't send frame over port where it has been sent before. - * Also fro SAN, this shouldn't be done. + * Also for SAN, this shouldn't be done. */ if (!frame->is_from_san && hsr_register_frame_out(port, frame->node_src, diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 257b50124c..9756e657ba 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -174,4 +174,5 @@ static void __exit hsr_exit(void) module_init(hsr_init); module_exit(hsr_exit); +MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver"); MODULE_LICENSE("GPL"); diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index e5742f2a2d..1b6457f357 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -220,7 +220,8 @@ void hsr_del_port(struct hsr_port *port) netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); - dev_set_promiscuity(port->dev, -1); + if (!port->hsr->fwd_offloaded) + dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); } diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile index f05b7bdae2..7bce67673e 100644 --- a/net/ieee802154/Makefile +++ b/net/ieee802154/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o obj-y += 6lowpan/ ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \ - header_ops.o sysfs.o nl802154.o trace.o + header_ops.o sysfs.o nl802154.o trace.o pan.o ieee802154_socket-y := socket.o CFLAGS_trace.o := -I$(src) diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 57546e07e0..60e8fff134 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -198,6 +198,25 @@ void wpan_phy_free(struct wpan_phy *phy) } EXPORT_SYMBOL(wpan_phy_free); +static void cfg802154_free_peer_structures(struct wpan_dev *wpan_dev) +{ + struct ieee802154_pan_device *child, *tmp; + + mutex_lock(&wpan_dev->association_lock); + + kfree(wpan_dev->parent); + wpan_dev->parent = NULL; + + list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) { + list_del(&child->node); + kfree(child); + } + + wpan_dev->nchildren = 0; + + mutex_unlock(&wpan_dev->association_lock); +} + int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net) { @@ -276,6 +295,9 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; + mutex_init(&wpan_dev->association_lock); + INIT_LIST_HEAD(&wpan_dev->children); + wpan_dev->max_associations = SZ_16K; wpan_dev->netdev = dev; break; @@ -291,6 +313,8 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, rdev->opencount++; break; case NETDEV_UNREGISTER: + cfg802154_free_peer_structures(wpan_dev); + /* It is possible to get NETDEV_UNREGISTER * multiple times. To detect that, check * that the interface is still on the list diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 1a265a4213..7eb37de3ad 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -234,6 +234,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { NL802154_SCAN_DONE_REASON_ABORTED), [NL802154_ATTR_BEACON_INTERVAL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_ACTIVE_SCAN_DURATION), + [NL802154_ATTR_MAX_ASSOCIATIONS] = { .type = NLA_U32 }, + [NL802154_ATTR_PEER] = { .type = NLA_NESTED }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, @@ -248,7 +250,6 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; -#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static int nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -307,7 +308,6 @@ nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev) { rtnl_unlock(); } -#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ /* message building helper */ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, @@ -1087,15 +1087,14 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); - /* TODO - * I am not sure about to check here on broadcast pan_id. - * Broadcast is a valid setting, comment from 802.15.4: - * If this value is 0xffff, the device is not associated. - * - * This could useful to simple deassociate an device. + /* Only allow changing the PAN ID when the device has no more + * associations ongoing to avoid confusing peers. */ - if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + if (cfg802154_device_is_associated(wpan_dev)) { + NL_SET_ERR_MSG(info->extack, + "Existing associations, changing PAN ID forbidden"); return -EINVAL; + } return rdev_set_pan_id(rdev, wpan_dev, pan_id); } @@ -1123,20 +1122,17 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); - /* TODO - * I am not sure about to check here on broadcast short_addr. - * Broadcast is a valid setting, comment from 802.15.4: - * A value of 0xfffe indicates that the device has - * associated but has not been allocated an address. A - * value of 0xffff indicates that the device does not - * have a short address. - * - * I think we should allow to set these settings but - * don't allow to allow socket communication with it. + /* The short address only has a meaning when part of a PAN, after a + * proper association procedure. However, we want to still offer the + * possibility to create static networks so changing the short address + * is only allowed when not already associated to other devices with + * the official handshake. */ - if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) || - short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST)) + if (cfg802154_device_is_associated(wpan_dev)) { + NL_SET_ERR_MSG(info->extack, + "Existing associations, changing short address forbidden"); return -EINVAL; + } return rdev_set_short_addr(rdev, wpan_dev, short_addr); } @@ -1638,6 +1634,189 @@ nl802154_stop_beacons(struct sk_buff *skb, struct genl_info *info) return rdev_stop_beacons(rdev, wpan_dev); } +static int nl802154_associate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev; + struct wpan_phy *wpan_phy; + struct ieee802154_addr coord; + int err; + + wpan_dev = dev->ieee802154_ptr; + wpan_phy = &rdev->wpan_phy; + + if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { + NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); + return -EOPNOTSUPP; + } + + if (!info->attrs[NL802154_ATTR_PAN_ID] || + !info->attrs[NL802154_ATTR_EXTENDED_ADDR]) + return -EINVAL; + + coord.pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); + coord.mode = IEEE802154_ADDR_LONG; + coord.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); + + mutex_lock(&wpan_dev->association_lock); + err = rdev_associate(rdev, wpan_dev, &coord); + mutex_unlock(&wpan_dev->association_lock); + if (err) + pr_err("Association with PAN ID 0x%x failed (%d)\n", + le16_to_cpu(coord.pan_id), err); + + return err; +} + +static int nl802154_disassociate(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + struct wpan_phy *wpan_phy = &rdev->wpan_phy; + struct ieee802154_addr target; + + if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { + NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); + return -EOPNOTSUPP; + } + + target.pan_id = wpan_dev->pan_id; + + if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) { + target.mode = IEEE802154_ADDR_LONG; + target.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); + } else if (info->attrs[NL802154_ATTR_SHORT_ADDR]) { + target.mode = IEEE802154_ADDR_SHORT; + target.short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); + } else { + NL_SET_ERR_MSG(info->extack, "Device address is missing"); + return -EINVAL; + } + + mutex_lock(&wpan_dev->association_lock); + rdev_disassociate(rdev, wpan_dev, &target); + mutex_unlock(&wpan_dev->association_lock); + + return 0; +} + +static int nl802154_set_max_associations(struct sk_buff *skb, struct genl_info *info) +{ + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + unsigned int max_assoc; + + if (!info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]) { + NL_SET_ERR_MSG(info->extack, "No maximum number of association given"); + return -EINVAL; + } + + max_assoc = nla_get_u32(info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]); + + mutex_lock(&wpan_dev->association_lock); + cfg802154_set_max_associations(wpan_dev, max_assoc); + mutex_unlock(&wpan_dev->association_lock); + + return 0; +} + +static int nl802154_send_peer_info(struct sk_buff *msg, + struct netlink_callback *cb, + u32 seq, int flags, + struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_pan_device *peer, + enum nl802154_peer_type type) +{ + struct nlattr *nla; + void *hdr; + + ASSERT_RTNL(); + + hdr = nl802154hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags, + NL802154_CMD_LIST_ASSOCIATIONS); + if (!hdr) + return -ENOBUFS; + + genl_dump_check_consistent(cb, hdr); + + nla = nla_nest_start_noflag(msg, NL802154_ATTR_PEER); + if (!nla) + goto nla_put_failure; + + if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_PEER_TYPE, type)) + goto nla_put_failure; + + if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_MODE, peer->mode)) + goto nla_put_failure; + + if (nla_put(msg, NL802154_DEV_ADDR_ATTR_SHORT, + IEEE802154_SHORT_ADDR_LEN, &peer->short_addr)) + goto nla_put_failure; + + if (nla_put(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, + IEEE802154_EXTENDED_ADDR_LEN, &peer->extended_addr)) + goto nla_put_failure; + + nla_nest_end(msg, nla); + + genlmsg_end(msg, hdr); + + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nl802154_list_associations(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct cfg802154_registered_device *rdev; + struct ieee802154_pan_device *child; + struct wpan_dev *wpan_dev; + int err; + + err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); + if (err) + return err; + + mutex_lock(&wpan_dev->association_lock); + + if (cb->args[2]) + goto out; + + if (wpan_dev->parent) { + err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq, + NLM_F_MULTI, rdev, wpan_dev, + wpan_dev->parent, + NL802154_PEER_TYPE_PARENT); + if (err < 0) + goto out_err; + } + + list_for_each_entry(child, &wpan_dev->children, node) { + err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq, + NLM_F_MULTI, rdev, wpan_dev, + child, + NL802154_PEER_TYPE_CHILD); + if (err < 0) + goto out_err; + } + + cb->args[2] = 1; +out: + err = skb->len; +out_err: + mutex_unlock(&wpan_dev->association_lock); + + nl802154_finish_wpan_dev_dump(rdev); + + return err; +} + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, @@ -2759,6 +2938,34 @@ static const struct genl_ops nl802154_ops[] = { NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_ASSOCIATE, + .doit = nl802154_associate, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_CHECK_NETDEV_UP | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_DISASSOCIATE, + .doit = nl802154_disassociate, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_CHECK_NETDEV_UP | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_SET_MAX_ASSOCIATIONS, + .doit = nl802154_set_max_associations, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, + { + .cmd = NL802154_CMD_LIST_ASSOCIATIONS, + .dumpit = nl802154_list_associations, + /* can be retrieved by unprivileged users */ + }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL { .cmd = NL802154_CMD_SET_SEC_PARAMS, diff --git a/net/ieee802154/pan.c b/net/ieee802154/pan.c new file mode 100644 index 0000000000..249df7364b --- /dev/null +++ b/net/ieee802154/pan.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IEEE 802.15.4 PAN management + * + * Copyright (C) 2023 Qorvo US, Inc + * Authors: + * - David Girault <david.girault@qorvo.com> + * - Miquel Raynal <miquel.raynal@bootlin.com> + */ + +#include <linux/kernel.h> +#include <net/cfg802154.h> +#include <net/af_ieee802154.h> + +/* Checks whether a device address matches one from the PAN list. + * This helper is meant to be used only during PAN management, when we expect + * extended addresses to be used. + */ +static bool cfg802154_pan_device_is_matching(struct ieee802154_pan_device *pan_dev, + struct ieee802154_addr *ext_dev) +{ + if (!pan_dev || !ext_dev) + return false; + + if (ext_dev->mode == IEEE802154_ADDR_SHORT) + return false; + + return pan_dev->extended_addr == ext_dev->extended_addr; +} + +bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev) +{ + bool is_assoc; + + mutex_lock(&wpan_dev->association_lock); + is_assoc = !list_empty(&wpan_dev->children) || wpan_dev->parent; + mutex_unlock(&wpan_dev->association_lock); + + return is_assoc; +} + +bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target) +{ + lockdep_assert_held(&wpan_dev->association_lock); + + return cfg802154_pan_device_is_matching(wpan_dev->parent, target); +} +EXPORT_SYMBOL_GPL(cfg802154_device_is_parent); + +struct ieee802154_pan_device * +cfg802154_device_is_child(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target) +{ + struct ieee802154_pan_device *child; + + lockdep_assert_held(&wpan_dev->association_lock); + + list_for_each_entry(child, &wpan_dev->children, node) + if (cfg802154_pan_device_is_matching(child, target)) + return child; + + return NULL; +} +EXPORT_SYMBOL_GPL(cfg802154_device_is_child); + +__le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev) +{ + struct ieee802154_pan_device *child; + __le16 addr; + + lockdep_assert_held(&wpan_dev->association_lock); + + do { + get_random_bytes(&addr, 2); + if (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST) || + addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC)) + continue; + + if (wpan_dev->short_addr == addr) + continue; + + if (wpan_dev->parent && wpan_dev->parent->short_addr == addr) + continue; + + list_for_each_entry(child, &wpan_dev->children, node) + if (child->short_addr == addr) + continue; + + break; + } while (1); + + return addr; +} +EXPORT_SYMBOL_GPL(cfg802154_get_free_short_addr); + +unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, + unsigned int max) +{ + unsigned int old_max; + + lockdep_assert_held(&wpan_dev->association_lock); + + old_max = wpan_dev->max_associations; + wpan_dev->max_associations = max; + + return old_max; +} +EXPORT_SYMBOL_GPL(cfg802154_set_max_associations); diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 5eaae15c61..64071ef6f5 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -265,6 +265,36 @@ static inline int rdev_stop_beacons(struct cfg802154_registered_device *rdev, return ret; } +static inline int rdev_associate(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *coord) +{ + int ret; + + if (!rdev->ops->associate) + return -EOPNOTSUPP; + + trace_802154_rdev_associate(&rdev->wpan_phy, wpan_dev, coord); + ret = rdev->ops->associate(&rdev->wpan_phy, wpan_dev, coord); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +static inline int rdev_disassociate(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *target) +{ + int ret; + + if (!rdev->ops->disassociate) + return -EOPNOTSUPP; + + trace_802154_rdev_disassociate(&rdev->wpan_phy, wpan_dev, target); + ret = rdev->ops->disassociate(&rdev->wpan_phy, wpan_dev, target); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL /* TODO this is already a nl802154, so move into ieee802154 */ static inline void diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index c16db0b326..62aa646525 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -356,6 +356,44 @@ DEFINE_EVENT(802154_wdev_template, 802154_rdev_stop_beacons, TP_ARGS(wpan_phy, wpan_dev) ); +TRACE_EVENT(802154_rdev_associate, + TP_PROTO(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *coord), + TP_ARGS(wpan_phy, wpan_dev, coord), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(__le64, addr) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->addr = coord->extended_addr; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", associating with: 0x%llx", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr) +); + +TRACE_EVENT(802154_rdev_disassociate, + TP_PROTO(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *target), + TP_ARGS(wpan_phy, wpan_dev, target), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(__le64, addr) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->addr = target->extended_addr; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", disassociating with: 0x%llx", + WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr) +); + TRACE_EVENT(802154_rdev_return_int, TP_PROTO(struct wpan_phy *wpan_phy, int ret), TP_ARGS(wpan_phy, ret), diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index e144a02a6a..ec36d2ec05 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -16,8 +16,6 @@ obj-y := route.o inetpeer.o protocol.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ metrics.o netlink.o nexthop.o udp_tunnel_stub.o -obj-$(CONFIG_BPFILTER) += bpfilter/ - obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e59962f34c..a5a820ee20 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1853,9 +1853,7 @@ static __net_init int inet_init_net(struct net *net) /* * Set defaults for local port range */ - seqlock_init(&net->ipv4.ip_local_ports.lock); - net->ipv4.ip_local_ports.range[0] = 32768; - net->ipv4.ip_local_ports.range[1] = 60999; + net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; seqlock_init(&net->ipv4.ping_group_range.lock); /* diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a2e6e1fdf8..64aec3dff8 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -597,5 +597,6 @@ static void __exit ah4_fini(void) module_init(ah4_init); module_exit(ah4_fini); +MODULE_DESCRIPTION("IPv4 AH transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 39dcccf0f1..ae8b15e689 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -271,6 +271,74 @@ static int bpf_tcp_ca_validate(void *kdata) return tcp_validate_congestion_control(kdata); } +static u32 bpf_tcp_ca_ssthresh(struct sock *sk) +{ + return 0; +} + +static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked) +{ +} + +static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state) +{ +} + +static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ +} + +static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags) +{ +} + +static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample) +{ +} + +static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) +{ + return 0; +} + +static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +{ +} + +static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk) +{ + return 0; +} + +static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk) +{ + return 0; +} + +static void __bpf_tcp_ca_init(struct sock *sk) +{ +} + +static void __bpf_tcp_ca_release(struct sock *sk) +{ +} + +static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .ssthresh = bpf_tcp_ca_ssthresh, + .cong_avoid = bpf_tcp_ca_cong_avoid, + .set_state = bpf_tcp_ca_set_state, + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, + .min_tso_segs = bpf_tcp_ca_min_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, + + .init = __bpf_tcp_ca_init, + .release = __bpf_tcp_ca_release, +}; + struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, @@ -281,6 +349,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .init = bpf_tcp_ca_init, .validate = bpf_tcp_ca_validate, .name = "tcp_congestion_ops", + .cfi_stubs = &__bpf_ops_tcp_congestion_ops, }; static int __init bpf_tcp_ca_kfunc_init(void) diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile deleted file mode 100644 index 00af5305e0..0000000000 --- a/net/ipv4/bpfilter/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_BPFILTER) += sockopt.o diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c deleted file mode 100644 index 193bcc2acc..0000000000 --- a/net/ipv4/bpfilter/sockopt.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/init.h> -#include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/bpfilter.h> -#include <uapi/linux/bpf.h> -#include <linux/wait.h> -#include <linux/kmod.h> -#include <linux/fs.h> -#include <linux/file.h> - -struct bpfilter_umh_ops bpfilter_ops; -EXPORT_SYMBOL_GPL(bpfilter_ops); - -static int bpfilter_mbox_request(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set) -{ - int err; - mutex_lock(&bpfilter_ops.lock); - if (!bpfilter_ops.sockopt) { - mutex_unlock(&bpfilter_ops.lock); - request_module("bpfilter"); - mutex_lock(&bpfilter_ops.lock); - - if (!bpfilter_ops.sockopt) { - err = -ENOPROTOOPT; - goto out; - } - } - if (bpfilter_ops.info.tgid && - thread_group_exited(bpfilter_ops.info.tgid)) - umd_cleanup_helper(&bpfilter_ops.info); - - if (!bpfilter_ops.info.tgid) { - err = bpfilter_ops.start(); - if (err) - goto out; - } - err = bpfilter_ops.sockopt(sk, optname, optval, optlen, is_set); -out: - mutex_unlock(&bpfilter_ops.lock); - return err; -} - -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen) -{ - return bpfilter_mbox_request(sk, optname, optval, optlen, true); -} - -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) -{ - int len; - - if (get_user(len, optlen)) - return -EFAULT; - - return bpfilter_mbox_request(sk, optname, USER_SOCKPTR(optval), len, - false); -} - -static int __init bpfilter_sockopt_init(void) -{ - mutex_init(&bpfilter_ops.lock); - bpfilter_ops.info.tgid = NULL; - bpfilter_ops.info.driver_name = "bpfilter_umh"; - - return 0; -} -device_initcall(bpfilter_sockopt_init); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index fe501d2186..d33d124218 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1247,5 +1247,6 @@ static void __exit esp4_fini(void) module_init(esp4_init); module_exit(esp4_fini); +MODULE_DESCRIPTION("IPv4 ESP transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 513f475c6a..5bdd1c0160 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -395,13 +395,13 @@ static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL); if (err < 0) return err; - err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN); if (err < 0) return err; - err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0); + err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT); if (err < 0) return err; return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9bdfdab906..3ff35f8117 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -52,6 +52,7 @@ #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e63a3bf996..437e782b96 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -92,6 +92,7 @@ #include <net/inet_common.h> #include <net/ip_fib.h> #include <net/l3mdev.h> +#include <net/addrconf.h> /* * Build xmit assembly blocks @@ -1032,6 +1033,8 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; struct net *net = dev_net(skb->dev); + struct inet6_dev *in6_dev; + struct in_device *in_dev; struct net_device *dev; char buff[IFNAMSIZ]; u16 ident_len; @@ -1115,10 +1118,15 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) /* Fill bits in reply message */ if (dev->flags & IFF_UP) status |= ICMP_EXT_ECHOREPLY_ACTIVE; - if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list) + + in_dev = __in_dev_get_rcu(dev); + if (in_dev && rcu_access_pointer(in_dev->ifa_list)) status |= ICMP_EXT_ECHOREPLY_IPV4; - if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) + + in6_dev = __in6_dev_get(dev); + if (in6_dev && !list_empty(&in6_dev->addr_list)) status |= ICMP_EXT_ECHOREPLY_IPV6; + dev_put(dev); icmphdr->un.echo.sequence |= htons(status); return true; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 762817d6c8..d1492c649a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -117,37 +117,39 @@ bool inet_rcv_saddr_any(const struct sock *sk) return !sk->sk_rcv_saddr; } -void inet_get_local_port_range(const struct net *net, int *low, int *high) -{ - unsigned int seq; - - do { - seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); - - *low = net->ipv4.ip_local_ports.range[0]; - *high = net->ipv4.ip_local_ports.range[1]; - } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); -} -EXPORT_SYMBOL(inet_get_local_port_range); - -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) +/** + * inet_sk_get_local_port_range - fetch ephemeral ports range + * @sk: socket + * @low: pointer to low port + * @high: pointer to high port + * + * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range) + * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option. + * Returns true if IP_LOCAL_PORT_RANGE was set on this socket. + */ +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { - const struct inet_sock *inet = inet_sk(sk); - const struct net *net = sock_net(sk); int lo, hi, sk_lo, sk_hi; + bool local_range = false; + u32 sk_range; - inet_get_local_port_range(net, &lo, &hi); + inet_get_local_port_range(sock_net(sk), &lo, &hi); - sk_lo = inet->local_port_range.lo; - sk_hi = inet->local_port_range.hi; + sk_range = READ_ONCE(inet_sk(sk)->local_port_range); + if (unlikely(sk_range)) { + sk_lo = sk_range & 0xffff; + sk_hi = sk_range >> 16; - if (unlikely(lo <= sk_lo && sk_lo <= hi)) - lo = sk_lo; - if (unlikely(lo <= sk_hi && sk_hi <= hi)) - hi = sk_hi; + if (lo <= sk_lo && sk_lo <= hi) + lo = sk_lo; + if (lo <= sk_hi && sk_hi <= hi) + hi = sk_hi; + local_range = true; + } *low = lo; *high = hi; + return local_range; } EXPORT_SYMBOL(inet_sk_get_local_port_range); @@ -157,8 +159,11 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - return addr_type != IPV6_ADDR_ANY && - addr_type != IPV6_ADDR_MAPPED; + if (addr_type == IPV6_ADDR_ANY) + return false; + + if (addr_type != IPV6_ADDR_MAPPED) + return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); @@ -198,8 +203,15 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - return false; + if (ipv6_only_sock(sk2)) { + if (sk->sk_family == AF_INET) + return false; + +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return false; +#endif + } return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); @@ -211,18 +223,9 @@ static bool inet_bhash2_conflict(const struct sock *sk, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - struct inet_timewait_sock *tw2; struct sock *sk2; - sk_for_each_bound_bhash2(sk2, &tb2->owners) { - if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) - return true; - } - - twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { - sk2 = (struct sock *)tw2; - + sk_for_each_bound(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; @@ -231,15 +234,20 @@ static bool inet_bhash2_conflict(const struct sock *sk, return false; } +#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ + hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ + sk_for_each_bound(sk2, &(__tb2)->owners) + /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { - bool reuseport_cb_ok; - struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + struct sock_reuseport *reuseport_cb; + bool reuseport_cb_ok; + struct sock *sk2; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); @@ -247,32 +255,29 @@ static int inet_csk_bind_conflict(const struct sock *sk, reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); - /* - * Unlike other sk lookup places we do not check + /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if + * ipv4) should have been checked already. We need to do these two + * checks separately because their spinlocks have to be acquired/released + * independently of each other, to prevent possible deadlocks + */ + if (inet_use_bhash2_on_bind(sk)) + return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, + reuseport_cb_ok, reuseport_ok); + + /* Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ + sk_for_each_bound_bhash(sk2, tb2, tb) { + if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; - if (!inet_use_bhash2_on_bind(sk)) { - struct sock *sk2; - - sk_for_each_bound(sk2, &tb->owners) - if (inet_bind_conflict(sk, sk2, uid, relax, - reuseport_cb_ok, reuseport_ok) && - inet_rcv_saddr_equal(sk, sk2, true)) - return true; - - return false; + if (inet_rcv_saddr_equal(sk, sk2, true)) + return true; } - /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if - * ipv4) should have been checked already. We need to do these two - * checks separately because their spinlocks have to be acquired/released - * independently of each other, to prevent possible deadlocks - */ - return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok); + return false; } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or @@ -289,6 +294,7 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; + bool conflict = false; bool reuseport_cb_ok; rcu_read_lock(); @@ -301,18 +307,20 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l spin_lock(&head2->lock); - inet_bind_bucket_for_each(tb2, &head2->chain) - if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) - break; + inet_bind_bucket_for_each(tb2, &head2->chain) { + if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) + continue; - if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) { - spin_unlock(&head2->lock); - return true; + if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; + + conflict = true; + break; } spin_unlock(&head2->lock); - return false; + + return conflict; } /* @@ -455,7 +463,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; @@ -547,7 +555,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } if (!found_port) { - if (!hlist_empty(&tb->owners)) { + if (!hlist_empty(&tb->bhash2)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) @@ -567,7 +575,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, - net, head2, port, l3mdev, sk); + net, head2, tb, sk); if (!tb2) goto fail_unlock; bhash2_created = true; @@ -589,11 +597,10 @@ success: fail_unlock: if (ret) { + if (bhash2_created) + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - if (bhash2_created) - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - tb2); } if (head2_lock_acquired) spin_unlock(&head2->lock); @@ -774,6 +781,20 @@ void inet_csk_clear_xmit_timers(struct sock *sk) } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); +void inet_csk_clear_xmit_timers_sync(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* ongoing timer handlers need to acquire socket lock. */ + sock_not_owned_by_me(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; + + sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); + sk_stop_timer_sync(sk, &sk->sk_timer); +} + void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5f7fdbd01c..9804e9608a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1077,10 +1077,94 @@ skip_listen_ht: s_i = num = s_num = 0; } +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. + */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } + if (!(idiag_states & ~TCPF_LISTEN)) goto out; -#define SKARR_SZ 16 for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 7072fc0783..c88c9034d6 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -24,6 +24,8 @@ #include <net/ip.h> #include <net/ipv6.h> +#include "../core/sock_destructor.h" + /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. @@ -39,6 +41,7 @@ struct ipfrag_skb_cb { }; struct sk_buff *next_frag; int frag_run_len; + int ip_defrag_offset; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) @@ -396,12 +399,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, */ if (!last) fragrun_create(q, skb); /* First fragment. */ - else if (last->ip_defrag_offset + last->len < end) { + else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ - if (offset < last->ip_defrag_offset + last->len) + if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; - if (offset == last->ip_defrag_offset + last->len) + if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); @@ -418,13 +421,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, parent = *rbn; curr = rb_to_skb(parent); - curr_run_end = curr->ip_defrag_offset + + curr_run_end = FRAG_CB(curr)->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; - if (end <= curr->ip_defrag_offset) + if (end <= FRAG_CB(curr)->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; - else if (offset >= curr->ip_defrag_offset && + else if (offset >= FRAG_CB(curr)->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else @@ -438,7 +441,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, rb_insert_color(&skb->rbnode, &q->rb_fragments); } - skb->ip_defrag_offset = offset; + FRAG_CB(skb)->ip_defrag_offset = offset; return IPFRAG_OK; } @@ -448,13 +451,28 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); - struct sk_buff **nextp; + void (*destructor)(struct sk_buff *); + unsigned int orig_truesize = 0; + struct sk_buff **nextp = NULL; + struct sock *sk = skb->sk; int delta; + if (sk && is_skb_wmem(skb)) { + /* TX: skb->sk might have been passed as argument to + * dst->output and must remain valid until tx completes. + * + * Move sk to reassembled skb and fix up wmem accounting. + */ + orig_truesize = skb->truesize; + destructor = skb->destructor; + } + if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - return NULL; + if (!fp) { + head = skb; + goto out_restore_sk; + } FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; @@ -463,6 +481,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; + + if (orig_truesize) { + /* prevent skb_morph from releasing sk */ + skb->sk = NULL; + skb->destructor = NULL; + } skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, @@ -470,13 +494,13 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, consume_skb(head); head = skb; } - WARN_ON(head->ip_defrag_offset != 0); + WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) - return NULL; + goto out_restore_sk; delta += head->truesize; if (delta) @@ -492,7 +516,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) - return NULL; + goto out_restore_sk; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) @@ -509,6 +533,21 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, nextp = &skb_shinfo(head)->frag_list; } +out_restore_sk: + if (orig_truesize) { + int ts_delta = head->truesize - orig_truesize; + + /* if this reassembled skb is fragmented later, + * fraglist skbs will get skb->sk assigned from head->sk, + * and each frag skb will be released via sock_wfree. + * + * Update sk_wmem_alloc. + */ + head->sk = sk; + head->destructor = destructor; + refcount_add(ts_delta, &sk->sk_wmem_alloc); + } + return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); @@ -516,6 +555,8 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { + struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; + const unsigned int head_truesize = head->truesize; struct sk_buff **nextp = reasm_data; struct rb_node *rbn; struct sk_buff *fp; @@ -579,6 +620,9 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, head->prev = NULL; head->tstamp = q->stamp; head->mono_delivery_time = q->mono_delivery_time; + + if (sk) + refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(inet_frag_reasm_finish); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7967ff7e02..4e470f1848 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; - INIT_HLIST_HEAD(&tb->owners); + INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head(&tb->node, &head->chain); } return tb; @@ -87,7 +87,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->bhash2)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } @@ -100,47 +100,52 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net tb->l3mdev == l3mdev; } -static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, +static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk) { - write_pnet(&tb->ib_net, net); - tb->l3mdev = l3mdev; - tb->port = port; + write_pnet(&tb2->ib_net, net); + tb2->l3mdev = tb->l3mdev; + tb2->port = tb->port; #if IS_ENABLED(CONFIG_IPV6) - tb->family = sk->sk_family; - if (sk->sk_family == AF_INET6) - tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; - else + BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); + if (sk->sk_family == AF_INET6) { + tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; + } else { + tb2->addr_type = IPV6_ADDR_MAPPED; + ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); + } +#else + tb2->rcv_saddr = sk->sk_rcv_saddr; #endif - tb->rcv_saddr = sk->sk_rcv_saddr; - INIT_HLIST_HEAD(&tb->owners); - INIT_HLIST_HEAD(&tb->deathrow); - hlist_add_head(&tb->node, &head->chain); + INIT_HLIST_HEAD(&tb2->owners); + hlist_add_head(&tb2->node, &head->chain); + hlist_add_head(&tb2->bhash_node, &tb->bhash2); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, - int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk) { - struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); + struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); - if (tb) - inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); + if (tb2) + inet_bind2_bucket_init(tb2, net, head, tb, sk); - return tb; + return tb2; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { - if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); + __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); } } @@ -149,18 +154,11 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb2->family) { - if (sk->sk_family == AF_INET) - return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) && - tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; - - return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && - sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr; - } - if (sk->sk_family == AF_INET6) - return ipv6_addr_equal(&tb2->v6_rcv_saddr, - &sk->sk_v6_rcv_saddr); + return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); + + if (tb2->addr_type != IPV6_ADDR_MAPPED) + return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } @@ -169,10 +167,9 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { inet_sk(sk)->inet_num = port; - sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; - sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; + sk_add_bind_node(sk, &tb2->owners); } /* @@ -192,21 +189,20 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; - __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; - __sk_del_bind2_node(sk); + __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } spin_unlock(&head2->lock); + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -275,8 +271,7 @@ bhash2_find: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, - net, head2, port, - l3mdev, child); + net, head2, tb, child); if (!tb2) goto error; } @@ -836,16 +831,15 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const return false; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb->family) { - if (sk->sk_family == AF_INET) - return ipv6_addr_any(&tb->v6_rcv_saddr) || - ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); + if (tb->addr_type == IPV6_ADDR_ANY) + return true; + if (tb->addr_type != IPV6_ADDR_MAPPED) return false; - } - if (sk->sk_family == AF_INET6) - return ipv6_addr_any(&tb->v6_rcv_saddr); + if (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return false; #endif return tb->rcv_saddr == 0; } @@ -942,7 +936,7 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, spin_lock_bh(&head->lock); spin_lock(&head2->lock); - __sk_del_bind2_node(sk); + __sk_del_bind_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); @@ -957,10 +951,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; - inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); + inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); } - sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; + sk_add_bind_node(sk, &tb2->owners); spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); @@ -1012,7 +1006,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, bool tb_created = false; u32 remaining, offset; int ret, i, low, high; - int l3mdev; + bool local_ports; + int step, l3mdev; u32 index; if (port) { @@ -1024,10 +1019,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, l3mdev = inet_sk_bound_l3mdev(sk); - inet_sk_get_local_port_range(sk, &low, &high); + local_ports = inet_sk_get_local_port_range(sk, &low, &high); + step = local_ports ? 1 : 2; + high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; - if (likely(remaining > 1)) + if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, @@ -1040,10 +1037,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ - offset &= ~1U; + if (!local_ports) + offset &= ~1U; other_parity_scan: port = low + offset; - for (i = 0; i < remaining; i += 2, port += 2) { + for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) @@ -1060,7 +1058,7 @@ other_parity_scan: if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; - WARN_ON(hlist_empty(&tb->owners)); + WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw)) goto ok; @@ -1083,10 +1081,11 @@ next_port: cond_resched(); } - offset++; - if ((offset & 1) && remaining > 1) - goto other_parity_scan; - + if (!local_ports) { + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; + } return -EADDRNOTAVAIL; ok: @@ -1099,7 +1098,7 @@ ok: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, - head2, port, l3mdev, sk); + head2, tb, sk); if (!tb2) goto error; } @@ -1109,8 +1108,8 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, get_random_u32_below(8) * 2); - WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); + i = max_t(int, i, get_random_u32_below(8) * step); + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 757ae3a4e2..e8de45d34d 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -35,13 +35,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, if (!tb) return; - __hlist_del(&tw->tw_bind_node); + __sk_del_bind_node((struct sock *)tw); tw->tw_tb = NULL; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - - __hlist_del(&tw->tw_bind2_node); tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); __sock_put((struct sock *)tw); } @@ -94,18 +92,6 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } -static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - -static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind2_node, list); -} - /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -133,11 +119,10 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); - inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); tw->tw_tb2 = icsk->icsk_bind2_hash; WARN_ON(!icsk->icsk_bind2_hash); - inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); + sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a4941f53b5..fb947d1613 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -384,6 +384,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -487,7 +488,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); - skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5169c3c72c..1954a56fec 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -280,8 +280,13 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, tpi->flags | TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); } else { + if (unlikely(!pskb_may_pull(skb, + gre_hdr_len + sizeof(*ershdr)))) + return PACKET_REJECT; + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; + iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); @@ -1793,6 +1798,7 @@ static void __exit ipgre_fini(void) module_init(ipgre_init); module_exit(ipgre_fini); +MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 41537d18ee..67d8466223 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -972,8 +972,8 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref = false; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -982,10 +982,6 @@ static int __ip_append_data(struct sock *sk, mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; - if (cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; - hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); @@ -1052,6 +1048,11 @@ static int __ip_append_data(struct sock *sk, cork->length += length; + hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; + if (hold_tskey) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, @@ -1274,6 +1275,8 @@ error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + if (hold_tskey) + atomic_dec(&sk->sk_tskey); return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8a88e705d8..21d2ffa919 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -47,8 +47,6 @@ #include <linux/errqueue.h> #include <linux/uaccess.h> -#include <linux/bpfilter.h> - /* * SOL_IP control messages. */ @@ -775,7 +773,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max)) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -811,7 +809,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max) - 4) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -1055,6 +1053,19 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_TOS: /* This sets both TOS and Precedence */ ip_sock_set_tos(sk, val); return 0; + case IP_LOCAL_PORT_RANGE: + { + u16 lo = val; + u16 hi = val >> 16; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (lo != 0 && hi != 0 && lo > hi) + return -EINVAL; + + WRITE_ONCE(inet->local_port_range, val); + return 0; + } } err = 0; @@ -1241,7 +1252,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; - if (optlen > READ_ONCE(sysctl_optmem_max)) { + if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) { err = -ENOBUFS; break; } @@ -1332,20 +1343,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, err = xfrm_user_policy(sk, optname, optval, optlen); break; - case IP_LOCAL_PORT_RANGE: - { - const __u16 lo = val; - const __u16 hi = val >> 16; - - if (optlen != sizeof(__u32)) - goto e_inval; - if (lo != 0 && hi != 0 && lo > hi) - goto e_inval; - - inet->local_port_range.lo = lo; - inet->local_port_range.hi = hi; - break; - } default: err = -ENOPROTOOPT; break; @@ -1414,11 +1411,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_SET_REPLACE && - optname < BPFILTER_IPT_SET_MAX) - err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && @@ -1694,6 +1686,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } + case IP_LOCAL_PORT_RANGE: + val = READ_ONCE(inet->local_port_range); + goto copyval; } if (needs_rtnl) @@ -1723,9 +1718,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - case IP_LOCAL_PORT_RANGE: - val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; - break; case IP_PROTOCOL: val = inet_sk(sk)->inet_num; break; @@ -1766,11 +1758,6 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); -#if IS_ENABLED(CONFIG_BPFILTER_UMH) - if (optname >= BPFILTER_IPT_SO_GET_INFO && - optname < BPFILTER_IPT_GET_MAX) - err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen); -#endif #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index b1b6dcf216..55039191b8 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1298,6 +1298,7 @@ int ip_tunnel_init(struct net_device *dev) if (tunnel->collect_md) netif_keep_dst(dev); + netdev_lockdep_set_classes(dev); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1325,4 +1326,5 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) } EXPORT_SYMBOL_GPL(ip_tunnel_setup); +MODULE_DESCRIPTION("IPv4 tunnel implementation library"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9ab9b3ebe0..d1d6bb28ed 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -721,6 +721,7 @@ static void __exit vti_fini(void) module_init(vti_init); module_exit(vti_fini); +MODULE_DESCRIPTION("Virtual (secure) IP tunneling library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti"); MODULE_ALIAS_NETDEV("ip_vti0"); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 27b8f83c6e..03afa3871e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -658,6 +658,7 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); +MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 66eade3fb6..b53c36c473 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -253,7 +253,7 @@ static int __net_init ipmr_rules_init(struct net *net) goto err1; } - err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0); + err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT); if (err < 0) goto err2; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2407066b0f..14365b20f1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -956,6 +956,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct arpt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -964,6 +966,8 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1254,6 +1258,8 @@ static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct arpt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1262,6 +1268,8 @@ static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 7da1df4997..fe89a056eb 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1108,6 +1108,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1116,6 +1118,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1492,6 +1496,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ipt_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1500,6 +1506,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 16615d107c..f67d3d6fe9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -926,13 +926,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; -#ifdef CONFIG_IP_ROUTE_VERBOSE - if (log_martians && + if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); -#endif } out_put_peer: inet_putpeer(peer); @@ -2168,6 +2166,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, int err = -EINVAL; u32 tag = 0; + if (!in_dev) + return -EINVAL; + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d37282c06e..61f1c96cfe 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -189,12 +189,14 @@ __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie) +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) { + __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; - __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, - th->source, th->dest, seq); + __u32 mssind; + + mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } @@ -202,7 +204,7 @@ EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff) + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; @@ -212,7 +214,6 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, NULL, &own_req); if (child) { refcount_set(&req->rsk_refcnt, 1); - tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); if (rsk_drop_req(req)) { @@ -269,26 +270,46 @@ bool cookie_timestamp_decode(const struct net *net, } EXPORT_SYMBOL(cookie_timestamp_decode); -bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, - const struct net *net, const struct dst_entry *dst) +static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) { - bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; + struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + const struct tcphdr *th = tcp_hdr(skb); - if (!ecn_ok) - return false; + req->num_retrans = 0; - if (READ_ONCE(net->ipv4.sysctl_tcp_ecn)) - return true; + ireq->ir_num = ntohs(th->dest); + ireq->ir_rmt_port = th->source; + ireq->ir_iif = inet_request_bound_dev_if(sk, skb); + ireq->ir_mark = inet_request_mark(sk, skb); + + if (IS_ENABLED(CONFIG_SMC)) + ireq->smc_ok = 0; - return dst_feature(dst, RTAX_FEATURE_ECN); + treq->snt_synack = 0; + treq->tfo_listener = false; + treq->txhash = net_tx_rndhash(); + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = ntohl(th->ack_seq) - 1; + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; + treq->req_usec_ts = false; + +#if IS_ENABLED(CONFIG_MPTCP) + treq->is_mptcp = sk_is_mptcp(sk); + if (treq->is_mptcp) + return mptcp_subflow_init_cookie_req(req, sk, skb); +#endif + + return 0; } -EXPORT_SYMBOL(cookie_ecn_ok); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - const struct tcp_request_sock_ops *af_ops, - struct sock *sk, - struct sk_buff *skb) + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff) { + struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; @@ -300,126 +321,109 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, if (!req) return NULL; - treq = tcp_rsk(req); + if (cookie_tcp_reqsk_init(sk, skb, req)) { + reqsk_free(req); + return NULL; + } - /* treq->af_specific might be used to perform TCP_MD5 lookup */ - treq->af_specific = af_ops; + ireq = inet_rsk(req); + treq = tcp_rsk(req); - treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; - treq->req_usec_ts = false; + req->mss = mss; + req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; -#if IS_ENABLED(CONFIG_MPTCP) - treq->is_mptcp = sk_is_mptcp(sk); - if (treq->is_mptcp) { - int err = mptcp_subflow_init_cookie_req(req, sk, skb); + ireq->snd_wscale = tcp_opt->snd_wscale; + ireq->tstamp_ok = tcp_opt->saw_tstamp; + ireq->sack_ok = tcp_opt->sack_ok; + ireq->wscale_ok = tcp_opt->wscale_ok; + ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); - if (err) { - reqsk_free(req); - return NULL; - } - } -#endif + treq->ts_off = tsoff; return req; } EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); -/* On input, sk is a listener. - * Output is listener if incoming packet would not create a child - * NULL if memory could not be allocated. - */ -struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) +static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, + struct sk_buff *skb) { - struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct tcp_options_received tcp_opt; - struct inet_request_sock *ireq; - struct tcp_request_sock *treq; - struct tcp_sock *tp = tcp_sk(sk); - const struct tcphdr *th = tcp_hdr(skb); - __u32 cookie = ntohl(th->ack_seq) - 1; - struct sock *ret = sk; - struct request_sock *req; - int full_space, mss; - struct rtable *rt; - __u8 rcv_wscale; - struct flowi4 fl4; u32 tsoff = 0; - int l3index; - - if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || - !th->ack || th->rst) - goto out; + int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; - mss = __cookie_v4_check(ip_hdr(skb), th, cookie); - if (mss == 0) { - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); + if (!mss) { + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); + tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcp_ts_off(sock_net(sk), + tsoff = secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } - if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt)) + if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; - ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, - &tcp_request_sock_ipv4_ops, sk, skb); - if (!req) + return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, + &tcp_opt, mss, tsoff); +out: + return ERR_PTR(-EINVAL); +} + +/* On input, sk is a listener. + * Output is listener if incoming packet would not create a child + * NULL if memory could not be allocated. + */ +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) +{ + struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_request_sock *ireq; + struct net *net = sock_net(sk); + struct request_sock *req; + struct sock *ret = sk; + struct flowi4 fl4; + struct rtable *rt; + __u8 rcv_wscale; + int full_space; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + if (!req) + goto out_drop; + ireq = inet_rsk(req); - treq = tcp_rsk(req); - treq->rcv_isn = ntohl(th->seq) - 1; - treq->snt_isn = cookie; - treq->ts_off = 0; - treq->txhash = net_tx_rndhash(); - req->mss = mss; - ireq->ir_num = ntohs(th->dest); - ireq->ir_rmt_port = th->source; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->ir_mark = inet_request_mark(sk, skb); - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->sack_ok = tcp_opt.sack_ok; - ireq->wscale_ok = tcp_opt.wscale_ok; - ireq->tstamp_ok = tcp_opt.saw_tstamp; - req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = 0; - treq->tfo_listener = false; - - if (IS_ENABLED(CONFIG_SMC)) - ireq->smc_ok = 0; - - ireq->ir_iif = inet_request_bound_dev_if(sk, skb); - - l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); - tcp_ao_syncookie(sk, skb, treq, AF_INET, l3index); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); + RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); - if (security_inet_conn_request(sk, skb, req)) { - reqsk_free(req); - goto out; - } + if (security_inet_conn_request(sk, skb, req)) + goto out_free; - req->num_retrans = 0; + tcp_ao_syncookie(sk, skb, req, AF_INET); /* * We need to lookup the route here to get at the correct @@ -433,11 +437,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) { - reqsk_free(req); - goto out; - } + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + goto out_free; /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); @@ -453,13 +455,18 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); + ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); - ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ if (ret) inet_sk(ret)->cork.fl.u.ip4 = fl4; -out: return ret; +out: + return ret; +out_free: + reqsk_free(req); +out_drop: + return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f63a545a73..7e4f16a7dc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -50,26 +50,22 @@ static int tcp_plb_max_cong_thresh = 256; static int sysctl_tcp_low_latency __read_mostly; /* Update system visible IP port range */ -static void set_local_port_range(struct net *net, int range[2]) +static void set_local_port_range(struct net *net, unsigned int low, unsigned int high) { - bool same_parity = !((range[0] ^ range[1]) & 1); + bool same_parity = !((low ^ high) & 1); - write_seqlock_bh(&net->ipv4.ip_local_ports.lock); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } - net->ipv4.ip_local_ports.range[0] = range[0]; - net->ipv4.ip_local_ports.range[1] = range[1]; - write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); + WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low); } /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = - container_of(table->data, struct net, ipv4.ip_local_ports.range); + struct net *net = table->data; int ret; int range[2]; struct ctl_table tmp = { @@ -93,7 +89,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock))) ret = -EINVAL; else - set_local_port_range(net, range); + set_local_port_range(net, range[0], range[1]); } return ret; @@ -733,8 +729,8 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "ip_local_port_range", - .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), - .data = &init_net.ipv4.ip_local_ports.range, + .maxlen = 0, + .data = &init_net, .mode = 0644, .proc_handler = ipv4_local_port_range, }, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d03d48702..5887eac87b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1860,7 +1860,6 @@ static int receive_fallback_to_copy(struct sock *sk, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; zc->length = 0; @@ -1869,8 +1868,8 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - inq, &iov, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, + &msg.msg_iter); if (err) return err; @@ -1897,14 +1896,13 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - copylen, &iov, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, + &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); @@ -2616,6 +2614,7 @@ void tcp_set_state(struct sock *sk, int state) BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values @@ -2931,6 +2930,8 @@ void tcp_close(struct sock *sk, long timeout) lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); + if (!sk->sk_net_refcnt) + inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); @@ -4596,6 +4597,98 @@ static void __init tcp_init_mem(void) sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } +static void __init tcp_struct_check(void) +{ + /* TX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); + + /* TXRX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); + + /* RX read-mostly hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); + + /* TX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113); + + /* TXRX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); + + /* RX read-write hotpath cache lines */ + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); +} + void __init tcp_init(void) { int max_rshare, max_wshare, cnt; @@ -4606,6 +4699,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); + tcp_struct_check(); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index f8308d3f56..254d6e3f93 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -844,18 +844,30 @@ static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family, } void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index) + struct request_sock *req, unsigned short int family) { + struct tcp_request_sock *treq = tcp_rsk(req); const struct tcphdr *th = tcp_hdr(skb); const struct tcp_ao_hdr *aoh; struct tcp_ao_key *key; + int l3index; + + /* treq->af_specific is used to perform TCP_AO lookup + * in tcp_create_openreq_child(). + */ +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6) + treq->af_specific = &tcp_request_sock_ipv6_ops; + else +#endif + treq->af_specific = &tcp_request_sock_ipv4_ops; treq->used_tcp_ao = false; if (tcp_parse_auth_options(th, NULL, &aoh) || !aoh) return; + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), inet_rsk(req)->ir_iif); key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index); if (!key) /* Key not found, continue without TCP-AO */ @@ -1056,6 +1068,7 @@ void tcp_ao_connect_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_ao_info *ao_info; + struct hlist_node *next; union tcp_ao_addr *addr; struct tcp_ao_key *key; int family, l3index; @@ -1078,7 +1091,7 @@ void tcp_ao_connect_init(struct sock *sk) l3index = l3mdev_master_ifindex_by_index(sock_net(sk), sk->sk_bound_dev_if); - hlist_for_each_entry_rcu(key, &ao_info->head, node) { + hlist_for_each_entry_safe(key, next, &ao_info->head, node) { if (!tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1)) continue; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 701cb87043..df7b13f0e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -202,23 +202,17 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, } #endif -static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, - unsigned int len) +static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb, + unsigned int len) { - static bool __once __read_mostly; + struct net_device *dev; - if (!__once) { - struct net_device *dev; - - __once = true; - - rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); - if (!dev || len >= dev->mtu) - pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", - dev ? dev->name : "Unknown driver"); - rcu_read_unlock(); - } + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); + if (!dev || len >= READ_ONCE(dev->mtu)) + pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", + dev ? dev->name : "Unknown driver"); + rcu_read_unlock(); } /* Adapt the MSS value used to make delayed ack decision to the @@ -250,9 +244,8 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ - if (unlikely(len > icsk->icsk_ack.rcv_mss + - MAX_TCP_OPTION_SPACE)) - tcp_gro_dev_warn(sk, skb, len); + DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE, + tcp_gro_dev_warn, sk, skb, len); /* If the skb has a len of exactly 1*MSS and has the PSH bit * set then it is likely the end of an application write. So * more data may not be arriving soon, and yet the data sender diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1f9f6c1c19..d1ad20ce1c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -626,7 +626,6 @@ void tcp_retransmit_timer(struct sock *sk) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - icsk->icsk_backoff++; out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is @@ -647,11 +646,12 @@ out_reset_timer: tcp_rto_min(sk), TCP_RTO_MAX); } else if (sk->sk_state != TCP_SYN_SENT || - icsk->icsk_backoff > + tp->total_rto > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { /* Use normal (exponential) backoff unless linear timeouts are * activated. */ + icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 5048c47c79..4c1f836aae 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -294,4 +294,5 @@ static void __exit tunnel4_fini(void) module_init(tunnel4_init); module_exit(tunnel4_fini); +MODULE_DESCRIPTION("IPv4 XFRM tunnel library"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 17231c0f88..40282a3418 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -584,6 +584,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +EXPORT_SYMBOL(udp_encap_needed_key); + +#if IS_ENABLED(CONFIG_IPV6) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +EXPORT_SYMBOL(udpv6_encap_needed_key); +#endif + void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); @@ -1118,16 +1125,17 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); - if (err > 0) + if (err > 0) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); + connected = 0; + } if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; - connected = 0; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6c95d28d0c..c3d67423ae 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -449,8 +449,9 @@ static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->count++; p->data_len += skb->len; - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; p->truesize += skb->truesize; p->len += skb->len; @@ -551,11 +552,19 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - /* we can do L4 aggregation only if the packet can't land in a tunnel - * otherwise we could corrupt the inner stream + /* We can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream. Detecting such packets + * cannot be foolproof and the aggregation might still happen in some + * cases. Such packets should be caught in udp_unexpected_gso later. */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { + /* If the packet was locally encapsulated in a UDP tunnel that + * wasn't detected above, do not GRO. + */ + if (skb->encapsulation) + goto out; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; @@ -719,13 +728,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index a87defb2b1..860aff5f85 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -253,4 +253,5 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup); +MODULE_DESCRIPTION("IPv4 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 8489fa1065..8cb266af13 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -114,5 +114,6 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); +MODULE_DESCRIPTION("IPv4 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 055230b669..37d48aa073 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2061,9 +2061,10 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { - result = ifp; - in6_ifa_hold(ifp); - break; + if (in6_ifa_hold_safe(ifp)) { + result = ifp; + break; + } } } } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 2016e90e6e..eb474f0987 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -800,5 +800,6 @@ static void __exit ah6_fini(void) module_init(ah6_init); module_exit(ah6_fini); +MODULE_DESCRIPTION("IPv6 AH transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc6a502db3..fff7849680 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -60,9 +60,9 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, if (!oif) { if (ipv6_addr_is_multicast(&fl6->daddr)) - oif = np->mcast_oif; + oif = READ_ONCE(np->mcast_oif); else - oif = np->ucast_oif; + oif = READ_ONCE(np->ucast_oif); } fl6->flowi6_oif = oif; @@ -229,7 +229,7 @@ ipv4_connected: } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) - WRITE_ONCE(sk->sk_bound_dev_if, np->mcast_oif); + WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif)); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a3fa3eda38..7371886d4f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -1301,5 +1301,6 @@ static void __exit esp6_fini(void) module_init(esp6_init); module_exit(esp6_fini); +MODULE_DESCRIPTION("IPv6 ESP transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index 06750d65d4..4c00398f4d 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -16,6 +16,10 @@ static const struct net_offload dstopt_offload = { .flags = INET6_PROTO_GSO_EXTHDR, }; +static const struct net_offload hbh_offload = { + .flags = INET6_PROTO_GSO_EXTHDR, +}; + int __init ipv6_exthdrs_offload_init(void) { int ret; @@ -28,9 +32,16 @@ int __init ipv6_exthdrs_offload_init(void) if (ret) goto out_rt; + ret = inet6_add_offload(&hbh_offload, IPPROTO_HOPOPTS); + if (ret) + goto out_dstopts; + out: return ret; +out_dstopts: + inet6_del_offload(&dstopt_offload, IPPROTO_DSTOPTS); + out_rt: inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING); goto out; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index be52b18e08..52c04f0ac4 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -481,11 +481,11 @@ static int __net_init fib6_rules_net_init(struct net *net) if (IS_ERR(ops)) return PTR_ERR(ops); - err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0); + err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL); if (err) goto out_fib6_rules_ops; - err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN); if (err) goto out_fib6_rules_ops; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index f624270971..1635da0728 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -584,9 +584,9 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, tmp_hdr.icmp6_pointer = htonl(info); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); ipcm6_init_sk(&ipc6, sk); ipc6.sockc.mark = mark; @@ -770,9 +770,9 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4fc2cae0d1..8184076a39 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -645,19 +645,19 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (!w) { /* New dump: * - * 1. hook callback destructor. - */ - cb->args[3] = (long)cb->done; - cb->done = fib6_dump_done; - - /* - * 2. allocate and initialize walker. + * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; + + /* 2. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + } arg.skb = skb; @@ -1375,7 +1375,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; - struct fib6_node *fn, *pn = NULL; + struct fib6_node *fn; +#ifdef CONFIG_IPV6_SUBTREES + struct fib6_node *pn = NULL; +#endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; @@ -1399,9 +1402,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, goto out; } +#ifdef CONFIG_IPV6_SUBTREES pn = fn; -#ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 070d87abf7..289b83347d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -528,6 +528,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, struct ip6_tnl *tunnel; u8 ver; + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; @@ -1511,6 +1514,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) ip6gre_tnl_init_features(dev); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: @@ -1903,6 +1907,7 @@ static int ip6erspan_tap_init(struct net_device *dev) ip6erspan_tnl_link_config(tunnel, 1); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d631428733..cca64c7809 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -37,6 +37,40 @@ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ }) +static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) +{ + const struct net_offload *ops = NULL; + struct ipv6_opt_hdr *opth; + + for (;;) { + int len; + + ops = rcu_dereference(inet6_offloads[proto]); + + if (unlikely(!ops)) + break; + + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; + + opth = skb_gro_header(skb, off + sizeof(*opth), off); + if (unlikely(!opth)) + break; + + len = ipv6_optlen(opth); + + opth = skb_gro_header(skb, off + len, off); + if (unlikely(!opth)) + break; + proto = opth->nexthdr; + + off += len; + } + + skb_gro_pull(skb, off - skb_network_offset(skb)); + return proto; +} + static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; @@ -45,15 +79,13 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) struct ipv6_opt_hdr *opth; int len; - if (proto != NEXTHDR_HOP) { - ops = rcu_dereference(inet6_offloads[proto]); + ops = rcu_dereference(inet6_offloads[proto]); - if (unlikely(!ops)) - break; + if (unlikely(!ops)) + break; - if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) - break; - } + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; if (unlikely(!pskb_may_pull(skb, 8))) break; @@ -171,13 +203,12 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, proto = iph->nexthdr; for (;;) { - if (proto != NEXTHDR_HOP) { - *opps = rcu_dereference(inet6_offloads[proto]); - if (unlikely(!(*opps))) - break; - if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) - break; - } + *opps = rcu_dereference(inet6_offloads[proto]); + if (unlikely(!(*opps))) + break; + if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) + break; + opth = (void *)opth + optlen; optlen = ipv6_optlen(opth); len += optlen; @@ -206,28 +237,25 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, goto out; skb_set_network_header(skb, off); - skb_gro_pull(skb, sizeof(*iph)); - skb_set_transport_header(skb, skb_gro_offset(skb)); - flush += ntohs(iph->payload_len) != skb_gro_len(skb); + flush += ntohs(iph->payload_len) != skb->len - hlen; proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { - pskb_pull(skb, skb_gro_offset(skb)); - skb_gro_frag0_invalidate(skb); - proto = ipv6_gso_pull_exthdrs(skb, proto); - skb_gro_pull(skb, -skb_transport_offset(skb)); - skb_reset_transport_header(skb); - __skb_push(skb, skb_gro_offset(skb)); + proto = ipv6_gro_pull_exthdrs(skb, hlen, proto); ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; - iph = ipv6_hdr(skb); + iph = skb_gro_network_header(skb); + } else { + skb_gro_pull(skb, sizeof(*iph)); } + skb_set_transport_header(skb, skb_gro_offset(skb)); + NAPI_GRO_CB(skb)->proto = proto; flush--; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a722a43dd6..31b86fe661 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1424,11 +1424,11 @@ static int __ip6_append_data(struct sock *sk, bool zc = false; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; + bool paged, hold_tskey, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref = false; skb = skb_peek_tail(queue); if (!skb) { @@ -1440,10 +1440,6 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; - if (cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; - hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1538,6 +1534,11 @@ emsgsize: flags &= ~MSG_SPLICE_PAGES; } + hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; + if (hold_tskey) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1794,6 +1795,8 @@ error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + if (hold_tskey) + atomic_dec(&sk->sk_tskey); return err; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9bbabf750a..70478027a7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1898,6 +1898,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; destroy_dst: diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index a7bf0327b3..c99053189e 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -182,4 +182,5 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); +MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index e550240c85..1163ca6ea4 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -935,6 +935,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 30ca064b76..9782c180fe 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -242,7 +242,7 @@ static int __net_init ip6mr_rules_init(struct net *net) goto err1; } - err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); + err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT); if (err < 0) goto err2; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7d661735cb..56c3c467f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -210,7 +210,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max)) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -244,7 +244,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > READ_ONCE(sysctl_optmem_max) - 4) + if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -509,6 +509,59 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); + case IPV6_MULTICAST_IF: + if (sk->sk_type == SOCK_STREAM) + return -ENOPROTOOPT; + if (optlen < sizeof(int)) + return -EINVAL; + if (val) { + struct net_device *dev; + int bound_dev_if, midx; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, val); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + midx = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && + bound_dev_if != val && + (!midx || midx != bound_dev_if)) + return -EINVAL; + } + WRITE_ONCE(np->mcast_oif, val); + return 0; + case IPV6_UNICAST_IF: + { + struct net_device *dev; + int ifindex; + + if (optlen != sizeof(int)) + return -EINVAL; + + ifindex = (__force int)ntohl((__force __be32)val); + if (!ifindex) { + WRITE_ONCE(np->ucast_oif, 0); + return 0; + } + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return -EADDRNOTAVAIL; + dev_put(dev); + + if (READ_ONCE(sk->sk_bound_dev_if)) + return -EINVAL; + + WRITE_ONCE(np->ucast_oif, ifindex); + return 0; + } } if (needs_rtnl) rtnl_lock(); @@ -829,67 +882,6 @@ done: break; } - - case IPV6_UNICAST_IF: - { - struct net_device *dev = NULL; - int ifindex; - - if (optlen != sizeof(int)) - goto e_inval; - - ifindex = (__force int)ntohl((__force __be32)val); - if (ifindex == 0) { - np->ucast_oif = 0; - retv = 0; - break; - } - - dev = dev_get_by_index(net, ifindex); - retv = -EADDRNOTAVAIL; - if (!dev) - break; - dev_put(dev); - - retv = -EINVAL; - if (sk->sk_bound_dev_if) - break; - - np->ucast_oif = ifindex; - retv = 0; - break; - } - - case IPV6_MULTICAST_IF: - if (sk->sk_type == SOCK_STREAM) - break; - if (optlen < sizeof(int)) - goto e_inval; - - if (val) { - struct net_device *dev; - int midx; - - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, val); - if (!dev) { - rcu_read_unlock(); - retv = -ENODEV; - break; - } - midx = l3mdev_master_ifindex_rcu(dev); - - rcu_read_unlock(); - - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != val && - (!midx || midx != sk->sk_bound_dev_if)) - goto e_inval; - } - np->mcast_oif = val; - retv = 0; - break; case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { @@ -1161,10 +1153,12 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { + int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; - src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + + src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; - src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; + src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { @@ -1178,11 +1172,13 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { + int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; - src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + + src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; - src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : - np->sticky_pktinfo.ipi6_addr; + src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : + np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { @@ -1359,7 +1355,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_MULTICAST_IF: - val = np->mcast_oif; + val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: @@ -1367,7 +1363,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_UNICAST_IF: - val = (__force int)htonl((__u32) np->ucast_oif); + val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 83d2a8be26..6a16a5bd0d 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -405,6 +405,7 @@ static void __exit mip6_fini(void) module_init(mip6_init); module_exit(mip6_fini); +MODULE_DESCRIPTION("IPv6 Mobility driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index fd9f049d6d..131f7bb211 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1125,6 +1125,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1133,6 +1135,8 @@ do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; @@ -1501,6 +1505,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) void *loc_cpu_entry; struct ip6t_entry *iter; + if (len < sizeof(tmp)) + return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; @@ -1509,6 +1515,8 @@ compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; + if ((u64)len < (u64)tmp.size + sizeof(tmp)) + return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b2dd48911c..efbec7ee27 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -294,6 +294,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -469,7 +470,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index d2098dd4ce..ef2059c889 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -107,9 +107,9 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif && ipv6_addr_is_multicast(daddr)) - oif = np->mcast_oif; + oif = READ_ONCE(np->mcast_oif); else if (!oif) - oif = np->ucast_oif; + oif = READ_ONCE(np->ucast_oif); addr_type = ipv6_addr_type(daddr); if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || @@ -157,9 +157,9 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rt = (struct rt6_info *) dst; if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dd0a4e73e6..03dbb874c3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -876,9 +876,9 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index cc24cefdb8..fbad6e1c97 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1460,6 +1460,7 @@ static int ipip6_tunnel_init(struct net_device *dev) return err; } netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } @@ -1956,6 +1957,7 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); +MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 12eedc6ca2..c8d2ca2722 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -114,76 +114,82 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) return __cookie_v6_init_sequence(iph, th, mssp); } -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - __u32 cookie) +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { + __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; - __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, - th->source, th->dest, seq); + __u32 mssind; + + mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v6_check); -struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct tcp_options_received tcp_opt; - struct inet_request_sock *ireq; - struct tcp_request_sock *treq; - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - const struct tcphdr *th = tcp_hdr(skb); - __u32 cookie = ntohl(th->ack_seq) - 1; - struct sock *ret = sk; - struct request_sock *req; - int full_space, mss; - struct dst_entry *dst; - __u8 rcv_wscale; u32 tsoff = 0; - int l3index; - - if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || - !th->ack || th->rst) - goto out; + int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; - mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie); - if (mss == 0) { - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb)); + if (!mss) { + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } - __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); + tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcpv6_ts_off(sock_net(sk), + tsoff = secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); tcp_opt.rcv_tsecr -= tsoff; } - if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt)) + if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; - ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, - &tcp_request_sock_ipv6_ops, sk, skb); - if (!req) + return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb, + &tcp_opt, mss, tsoff); +out: + return ERR_PTR(-EINVAL); +} + +struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_request_sock *ireq; + struct net *net = sock_net(sk); + struct request_sock *req; + struct dst_entry *dst; + struct sock *ret = sk; + __u8 rcv_wscale; + int full_space; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || + !th->ack || th->rst) goto out; + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + if (!req) + goto out_drop; + ireq = inet_rsk(req); - treq = tcp_rsk(req); - treq->tfo_listener = false; - req->mss = mss; - ireq->ir_rmt_port = th->source; - ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; @@ -197,31 +203,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->pktopts = skb; } - ireq->ir_iif = inet_request_bound_dev_if(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - ireq->ir_mark = inet_request_mark(sk, skb); - - req->num_retrans = 0; - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->sack_ok = tcp_opt.sack_ok; - ireq->wscale_ok = tcp_opt.wscale_ok; - ireq->tstamp_ok = tcp_opt.saw_tstamp; - req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; - treq->snt_synack = 0; - treq->rcv_isn = ntohl(th->seq) - 1; - treq->snt_isn = cookie; - treq->ts_off = 0; - treq->txhash = net_tx_rndhash(); - - l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); - tcp_ao_syncookie(sk, skb, treq, AF_INET6, l3index); - - if (IS_ENABLED(CONFIG_SMC)) - ireq->smc_ok = 0; + tcp_ao_syncookie(sk, skb, req, AF_INET6); /* * We need to lookup the dst_entry to get the correct window size. @@ -243,7 +230,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) goto out_free; } @@ -261,12 +248,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); + ireq->ecn_ok &= cookie_ecn_ok(net, dst); - ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff); + ret = tcp_get_cookie_sock(sk, skb, req, dst); out: return ret; out_free: reqsk_free(req); +out_drop: return NULL; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8c6623496d..57b25b1fc9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1699,7 +1699,7 @@ ipv6_pktoptions: if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - np->mcast_oif = tcp_v6_iif(opt_skb); + WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 00e8d8b1c9..dc4ea9b117 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -302,4 +302,5 @@ static void __exit tunnel6_fini(void) module_init(tunnel6_init); module_exit(tunnel6_fini); +MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a1a79ff46f..8c14c4cc82 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -450,7 +450,7 @@ csum_copy_err: goto try_again; } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); @@ -1476,9 +1476,11 @@ do_udp_sendmsg: ipc6.opt = opt; err = udp_cmsg_send(sk, msg, &ipc6.gso_size); - if (err > 0) + if (err > 0) { err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6, &ipc6); + connected = false; + } if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1490,7 +1492,6 @@ do_udp_sendmsg: } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; - connected = false; } if (!opt) { opt = txopt_get(np); @@ -1541,10 +1542,10 @@ do_udp_sendmsg: connected = false; if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) { - fl6->flowi6_oif = np->mcast_oif; + fl6->flowi6_oif = READ_ONCE(np->mcast_oif); connected = false; } else if (!fl6->flowi6_oif) - fl6->flowi6_oif = np->ucast_oif; + fl6->flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 6b95ba241e..626d7b362d 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 1323f2f692..f6cb94f82c 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -401,5 +401,6 @@ static void __exit xfrm6_tunnel_fini(void) module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); +MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index ce33adb65a..b0b3e9c5af 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -67,7 +67,7 @@ static int iucv_bus_match(struct device *dev, struct device_driver *drv) return 0; } -struct bus_type iucv_bus = { +const struct bus_type iucv_bus = { .name = "iucv", .match = iucv_bus_match, }; diff --git a/net/key/af_key.c b/net/key/af_key.c index d68d01804d..f79fb99271 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3924,5 +3924,6 @@ out_unregister_key_proto: module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); +MODULE_DESCRIPTION("PF_KEY socket helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 763a59414b..7bf14cf9ff 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -599,9 +599,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) - fl6.flowi6_oif = np->mcast_oif; + fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) - fl6.flowi6_oif = np->ucast_oif; + fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index cb0291decf..13438cc0a6 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -62,7 +62,6 @@ config MAC80211_KUNIT_TEST depends on KUNIT depends on MAC80211 default KUNIT_ALL_TESTS - depends on !KERNEL_6_2 help Enable this option to test mac80211 internals with kunit. diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index c9eb527681..4406b4f8f3 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -67,4 +67,6 @@ mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y) obj-y += tests/ +mac80211-y += wbrf.o + ccflags-y += -DDEBUG diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ebaf930bb4..1d43a80064 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1272,7 +1272,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, return -EALREADY; if (params->smps_mode != NL80211_SMPS_OFF) - return -ENOTSUPP; + return -EOPNOTSUPP; link->smps_mode = IEEE80211_SMPS_OFF; @@ -2557,7 +2557,7 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, * devices that report signal in dBm. */ if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM)) - return -ENOTSUPP; + return -EOPNOTSUPP; conf->rssi_threshold = nconf->rssi_threshold; } if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) { diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 1d928f29ad..ef4c2cebc0 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -507,11 +507,16 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef)); + ieee80211_remove_wbrf(local, &ctx->conf.def); + ctx->conf.def = *chandef; /* check if min chanctx also changed */ changed = IEEE80211_CHANCTX_CHANGE_WIDTH | _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); + + ieee80211_add_wbrf(local, &ctx->conf.def); + drv_change_chanctx(local, ctx, changed); if (!local->use_chanctx) { @@ -667,6 +672,8 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, lockdep_assert_wiphy(local->hw.wiphy); + ieee80211_add_wbrf(local, &ctx->conf.def); + if (!local->use_chanctx) local->hw.conf.radar_enabled = ctx->conf.radar_enabled; @@ -746,6 +753,8 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local, } ieee80211_recalc_idle(local); + + ieee80211_remove_wbrf(local, &ctx->conf.def); } static void ieee80211_free_chanctx(struct ieee80211_local *local, @@ -858,7 +867,7 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, int ret = 0; if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) - return -ENOTSUPP; + return -EOPNOTSUPP; conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); @@ -1106,7 +1115,7 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, curr_ctx = ieee80211_link_get_chanctx(link); if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx) - return -ENOTSUPP; + return -EOPNOTSUPP; new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode); if (!new_ctx) { diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b575ae90e5..74be49191e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -497,6 +497,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_CONC_MON_RX_DECAP), FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), + FLAG(DISALLOW_PUNCTURING), #undef FLAG }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 5bf507ebb0..1e9389c49a 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -16,7 +16,7 @@ #include "sta_info.h" #include "driver-ops.h" -/* sta attributtes */ +/* sta attributes */ #define STA_READ(name, field, format_string) \ static ssize_t sta_ ##name## _read(struct file *file, \ diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index f690c385a3..eb482fb8c3 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -805,7 +805,7 @@ drv_cancel_remain_on_channel(struct ieee80211_local *local, static inline int drv_set_ringparam(struct ieee80211_local *local, u32 tx, u32 rx) { - int ret = -ENOTSUPP; + int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1666,6 +1666,26 @@ static inline int drv_net_setup_tc(struct ieee80211_local *local, return ret; } +static inline bool drv_can_activate_links(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u16 active_links) +{ + bool ret = true; + + lockdep_assert_wiphy(local->hw.wiphy); + + if (!check_sdata_in_driver(sdata)) + return false; + + trace_drv_can_activate_links(local, sdata, active_links); + if (local->ops->can_activate_links) + ret = local->ops->can_activate_links(&local->hw, &sdata->vif, + active_links); + trace_drv_return_bool(local, ret); + + return ret; +} + int drv_change_vif_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 8b1e02f2f9..8f2b445a5e 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -485,7 +485,7 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); - if (WARN_ON(!cbss)) + if (unlikely(!cbss)) return -EINVAL; rcu_read_lock(); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e0a792a770..a18361afea 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -92,11 +92,14 @@ enum ieee80211_status_data { IEEE80211_STATUS_SUBDATA_MASK = 0xff0, }; -/* - * Keep a station's queues on the active list for deficit accounting purposes - * if it was active or queued during the last 100ms - */ -#define AIRTIME_ACTIVE_DURATION (HZ / 10) +static inline bool +ieee80211_sta_keep_active(struct sta_info *sta, u8 ac) +{ + /* Keep a station's queues on the active list for deficit accounting + * purposes if it was active or queued during the last 100ms. + */ + return time_before_eq(jiffies, sta->airtime[ac].last_active + HZ / 10); +} struct ieee80211_bss { u32 device_ts_beacon, device_ts_presp; @@ -436,6 +439,7 @@ struct ieee80211_mgd_assoc_data { bool need_beacon; bool synced; bool timeout_started; + bool comeback; /* whether the AP has requested association comeback */ bool s1g; unsigned int assoc_link_id; @@ -1559,6 +1563,8 @@ struct ieee80211_local { /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; + + bool wbrf_supported; }; static inline struct ieee80211_sub_if_data * @@ -1770,10 +1776,7 @@ static inline bool txq_has_queue(struct ieee80211_txq *txq) static inline bool ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) { - WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START && - status->flag & RX_FLAG_MACTIME_END); - return !!(status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END | - RX_FLAG_MACTIME_PLCP_START)); + return status->flag & RX_FLAG_MACTIME; } void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata); @@ -2600,4 +2603,19 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_eht_cap_elem *eht_cap_ie_elem, u8 eht_cap_len, struct link_sta_info *link_sta); + +void ieee80211_check_wbrf_support(struct ieee80211_local *local); +void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); +void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); + +#if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) +#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) +#define VISIBLE_IF_MAC80211_KUNIT +ieee80211_rx_result +ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx); +#else +#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) +#define VISIBLE_IF_MAC80211_KUNIT static +#endif + #endif /* IEEE80211_I_H */ diff --git a/net/mac80211/link.c b/net/mac80211/link.c index bf7bd880d0..d4f86955af 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -444,6 +444,9 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links) lockdep_assert_wiphy(local->hw.wiphy); + if (!drv_can_activate_links(local, sdata, active_links)) + return -EINVAL; + old_active = sdata->vif.active_links; if (old_active & active_links) { /* diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 033a5261ac..f2ece77935 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1405,6 +1405,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) debugfs_hw_add(local); rate_control_add_debugfs(local); + ieee80211_check_wbrf_support(local); + rtnl_lock(); wiphy_lock(hw->wiphy); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index fccbcde335..3d4806b7ff 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -769,6 +769,9 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 ctrl_flags) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; + struct ieee80211_mesh_fast_tx_key key = { + .type = MESH_FAST_TX_TYPE_LOCAL + }; struct ieee80211_mesh_fast_tx *entry; struct ieee80211s_hdr *meshhdr; u8 sa[ETH_ALEN] __aligned(2); @@ -804,7 +807,10 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, return false; } - entry = mesh_fast_tx_get(sdata, skb->data); + ether_addr_copy(key.addr, skb->data); + if (!ether_addr_equal(skb->data + ETH_ALEN, sdata->vif.addr)) + key.type = MESH_FAST_TX_TYPE_PROXIED; + entry = mesh_fast_tx_get(sdata, &key); if (!entry) return false; diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index ad8469293d..58c619874c 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -134,9 +134,38 @@ struct mesh_path { #define MESH_FAST_TX_CACHE_TIMEOUT 8000 /* msecs */ /** + * enum ieee80211_mesh_fast_tx_type - cached mesh fast tx entry type + * + * @MESH_FAST_TX_TYPE_LOCAL: tx from the local vif address as SA + * @MESH_FAST_TX_TYPE_PROXIED: local tx with a different SA (e.g. bridged) + * @MESH_FAST_TX_TYPE_FORWARDED: forwarded from a different mesh point + * @NUM_MESH_FAST_TX_TYPE: number of entry types + */ +enum ieee80211_mesh_fast_tx_type { + MESH_FAST_TX_TYPE_LOCAL, + MESH_FAST_TX_TYPE_PROXIED, + MESH_FAST_TX_TYPE_FORWARDED, + + /* must be last */ + NUM_MESH_FAST_TX_TYPE +}; + + +/** + * struct ieee80211_mesh_fast_tx_key - cached mesh fast tx entry key + * + * @addr: The Ethernet DA for this entry + * @type: cache entry type + */ +struct ieee80211_mesh_fast_tx_key { + u8 addr[ETH_ALEN] __aligned(2); + u16 type; +}; + +/** * struct ieee80211_mesh_fast_tx - cached mesh fast tx entry * @rhash: rhashtable pointer - * @addr_key: The Ethernet DA which is the key for this entry + * @key: the lookup key for this cache entry * @fast_tx: base fast_tx data * @hdr: cached mesh and rfc1042 headers * @hdrlen: length of mesh + rfc1042 @@ -147,7 +176,7 @@ struct mesh_path { */ struct ieee80211_mesh_fast_tx { struct rhash_head rhash; - u8 addr_key[ETH_ALEN] __aligned(2); + struct ieee80211_mesh_fast_tx_key key; struct ieee80211_fast_tx fast_tx; u8 hdr[sizeof(struct ieee80211s_hdr) + sizeof(rfc1042_header)]; @@ -333,7 +362,8 @@ void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata); bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt); struct ieee80211_mesh_fast_tx * -mesh_fast_tx_get(struct ieee80211_sub_if_data *sdata, const u8 *addr); +mesh_fast_tx_get(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mesh_fast_tx_key *key); bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 ctrl_flags); void mesh_fast_tx_cache(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 775d52561c..024f48db6b 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -151,7 +151,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, break; default: kfree_skb(skb); - return -ENOTSUPP; + return -EOPNOTSUPP; } *pos++ = ie_len; *pos++ = flags; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 8a3f44ce3e..a6b62169f0 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -37,8 +37,8 @@ static const struct rhashtable_params mesh_rht_params = { static const struct rhashtable_params fast_tx_rht_params = { .nelem_hint = 10, .automatic_shrinking = true, - .key_len = ETH_ALEN, - .key_offset = offsetof(struct ieee80211_mesh_fast_tx, addr_key), + .key_len = sizeof_field(struct ieee80211_mesh_fast_tx, key), + .key_offset = offsetof(struct ieee80211_mesh_fast_tx, key), .head_offset = offsetof(struct ieee80211_mesh_fast_tx, rhash), .hashfn = mesh_table_hash, }; @@ -431,20 +431,21 @@ static void mesh_fast_tx_entry_free(struct mesh_tx_cache *cache, } struct ieee80211_mesh_fast_tx * -mesh_fast_tx_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) +mesh_fast_tx_get(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mesh_fast_tx_key *key) { struct ieee80211_mesh_fast_tx *entry; struct mesh_tx_cache *cache; cache = &sdata->u.mesh.tx_cache; - entry = rhashtable_lookup(&cache->rht, addr, fast_tx_rht_params); + entry = rhashtable_lookup(&cache->rht, key, fast_tx_rht_params); if (!entry) return NULL; if (!(entry->mpath->flags & MESH_PATH_ACTIVE) || mpath_expired(entry->mpath)) { spin_lock_bh(&cache->walk_lock); - entry = rhashtable_lookup(&cache->rht, addr, fast_tx_rht_params); + entry = rhashtable_lookup(&cache->rht, key, fast_tx_rht_params); if (entry) mesh_fast_tx_entry_free(cache, entry); spin_unlock_bh(&cache->walk_lock); @@ -489,18 +490,24 @@ void mesh_fast_tx_cache(struct ieee80211_sub_if_data *sdata, if (!sta) return; + build.key.type = MESH_FAST_TX_TYPE_LOCAL; if ((meshhdr->flags & MESH_FLAGS_AE) == MESH_FLAGS_AE_A5_A6) { /* This is required to keep the mppath alive */ mppath = mpp_path_lookup(sdata, meshhdr->eaddr1); if (!mppath) return; build.mppath = mppath; + if (!ether_addr_equal(meshhdr->eaddr2, sdata->vif.addr)) + build.key.type = MESH_FAST_TX_TYPE_PROXIED; } else if (ieee80211_has_a4(hdr->frame_control)) { mppath = mpath; } else { return; } + if (!ether_addr_equal(hdr->addr4, sdata->vif.addr)) + build.key.type = MESH_FAST_TX_TYPE_FORWARDED; + /* rate limit, in case fast xmit can't be enabled */ if (mppath->fast_tx_check == jiffies) return; @@ -547,7 +554,7 @@ void mesh_fast_tx_cache(struct ieee80211_sub_if_data *sdata, } } - memcpy(build.addr_key, mppath->dst, ETH_ALEN); + memcpy(build.key.addr, mppath->dst, ETH_ALEN); build.timestamp = jiffies; build.fast_tx.band = info->band; build.fast_tx.da_offs = offsetof(struct ieee80211_hdr, addr3); @@ -600,11 +607,10 @@ unlock_sta: void mesh_fast_tx_gc(struct ieee80211_sub_if_data *sdata) { unsigned long timeout = msecs_to_jiffies(MESH_FAST_TX_CACHE_TIMEOUT); - struct mesh_tx_cache *cache; + struct mesh_tx_cache *cache = &sdata->u.mesh.tx_cache; struct ieee80211_mesh_fast_tx *entry; struct hlist_node *n; - cache = &sdata->u.mesh.tx_cache; if (atomic_read(&cache->rht.nelems) < MESH_FAST_TX_CACHE_THRESHOLD_SIZE) return; @@ -622,7 +628,6 @@ void mesh_fast_tx_flush_mpath(struct mesh_path *mpath) struct ieee80211_mesh_fast_tx *entry; struct hlist_node *n; - cache = &sdata->u.mesh.tx_cache; spin_lock_bh(&cache->walk_lock); hlist_for_each_entry_safe(entry, n, &cache->walk_head, walk_list) if (entry->mpath == mpath) @@ -637,7 +642,6 @@ void mesh_fast_tx_flush_sta(struct ieee80211_sub_if_data *sdata, struct ieee80211_mesh_fast_tx *entry; struct hlist_node *n; - cache = &sdata->u.mesh.tx_cache; spin_lock_bh(&cache->walk_lock); hlist_for_each_entry_safe(entry, n, &cache->walk_head, walk_list) if (rcu_access_pointer(entry->mpath->next_hop) == sta) @@ -649,13 +653,18 @@ void mesh_fast_tx_flush_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct mesh_tx_cache *cache = &sdata->u.mesh.tx_cache; + struct ieee80211_mesh_fast_tx_key key = {}; struct ieee80211_mesh_fast_tx *entry; + int i; - cache = &sdata->u.mesh.tx_cache; + ether_addr_copy(key.addr, addr); spin_lock_bh(&cache->walk_lock); - entry = rhashtable_lookup_fast(&cache->rht, addr, fast_tx_rht_params); - if (entry) - mesh_fast_tx_entry_free(cache, entry); + for (i = 0; i < NUM_MESH_FAST_TX_TYPE; i++) { + key.type = i; + entry = rhashtable_lookup_fast(&cache->rht, &key, fast_tx_rht_params); + if (entry) + mesh_fast_tx_entry_free(cache, entry); + } spin_unlock_bh(&cache->walk_lock); } @@ -676,10 +685,10 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, if (ether_addr_equal(dst, sdata->vif.addr)) /* never add ourselves as neighbours */ - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); if (is_multicast_ether_addr(dst)) - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0) return ERR_PTR(-ENOSPC); @@ -719,10 +728,10 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, if (ether_addr_equal(dst, sdata->vif.addr)) /* never add ourselves as neighbours */ - return -ENOTSUPP; + return -EOPNOTSUPP; if (is_multicast_ether_addr(dst)) - return -ENOTSUPP; + return -EOPNOTSUPP; new_mpath = mesh_path_new(sdata, dst, GFP_ATOMIC); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6cfc07aaa1..94028b541b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -138,6 +138,7 @@ ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link, u16 bitmap, u64 *changed) { struct cfg80211_chan_def *chandef = &link->conf->chandef; + struct ieee80211_local *local = link->sdata->local; u16 extracted; u64 _changed = 0; @@ -150,7 +151,9 @@ ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link, bitmap); if (cfg80211_valid_disable_subchannel_bitmap(&bitmap, - chandef)) + chandef) && + !(bitmap && ieee80211_hw_check(&local->hw, + DISALLOW_PUNCTURING))) break; link->u.mgd.conn_flags |= ieee80211_chandef_downgrade(chandef); @@ -598,6 +601,7 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, return ret; } + cfg80211_schedule_channels_check(&sdata->wdev); return 0; } @@ -1385,7 +1389,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) struct ieee80211_mgmt *mgmt; u8 *pos, qos_info, *ie_start; size_t offset, noffset; - u16 capab = WLAN_CAPABILITY_ESS, link_capab; + u16 capab = 0, link_capab; __le16 listen_int; struct element *ext_capa = NULL; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); @@ -1532,6 +1536,17 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) *pos++ = assoc_data->ssid_len; memcpy(pos, assoc_data->ssid, assoc_data->ssid_len); + /* + * This bit is technically reserved, so it shouldn't matter for either + * the AP or us, but it also means we shouldn't set it. However, we've + * always set it in the past, and apparently some EHT APs check that + * we don't set it. To avoid interoperability issues with old APs that + * for some reason check it and want it to be set, set the bit for all + * pre-EHT connections as we used to do. + */ + if (link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT) + capab |= WLAN_CAPABILITY_ESS; + /* add the elements for the assoc (main) link */ link_capab = capab; offset = ieee80211_assoc_link_elems(sdata, skb, &link_capab, @@ -5368,6 +5383,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, assoc_data->ap_addr, tu, ms); assoc_data->timeout = jiffies + msecs_to_jiffies(ms); assoc_data->timeout_started = true; + assoc_data->comeback = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(sdata, assoc_data->timeout); goto notify_driver; @@ -5389,33 +5405,24 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } if (ieee80211_vif_is_mld(&sdata->vif)) { + struct ieee80211_mle_basic_common_info *common; + if (!elems->ml_basic) { sdata_info(sdata, - "MLO association with %pM but no multi-link element in response!\n", + "MLO association with %pM but no (basic) multi-link element in response!\n", assoc_data->ap_addr); goto abandon_assoc; } - if (le16_get_bits(elems->ml_basic->control, - IEEE80211_ML_CONTROL_TYPE) != - IEEE80211_ML_CONTROL_TYPE_BASIC) { + common = (void *)elems->ml_basic->variable; + + if (memcmp(assoc_data->ap_addr, + common->mld_mac_addr, ETH_ALEN)) { sdata_info(sdata, - "bad multi-link element (control=0x%x)\n", - le16_to_cpu(elems->ml_basic->control)); + "AP MLD MAC address mismatch: got %pM expected %pM\n", + common->mld_mac_addr, + assoc_data->ap_addr); goto abandon_assoc; - } else { - struct ieee80211_mle_basic_common_info *common; - - common = (void *)elems->ml_basic->variable; - - if (memcmp(assoc_data->ap_addr, - common->mld_mac_addr, ETH_ALEN)) { - sdata_info(sdata, - "AP MLD MAC address mismatch: got %pM expected %pM\n", - common->mld_mac_addr, - assoc_data->ap_addr); - goto abandon_assoc; - } } } @@ -5683,6 +5690,7 @@ static bool ieee80211_config_puncturing(struct ieee80211_link_data *link, const struct ieee80211_eht_operation *eht_oper, u64 *changed) { + struct ieee80211_local *local = link->sdata->local; u16 bitmap = 0, extracted; if ((eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) && @@ -5714,6 +5722,9 @@ static bool ieee80211_config_puncturing(struct ieee80211_link_data *link, return false; } + if (bitmap && ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING)) + return false; + ieee80211_handle_puncturing_bitmap(link, eht_oper, bitmap, changed); return true; } @@ -5837,7 +5848,7 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) - link_removal_timeout[link_id] = le16_to_cpu(*(__le16 *)pos); + link_removal_timeout[link_id] = get_unaligned_le16(pos); } removed_links &= sdata->vif.valid_links; @@ -5862,8 +5873,11 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, continue; } - link_delay = link_conf->beacon_int * - link_removal_timeout[link_id]; + if (link_removal_timeout[link_id] < 1) + link_delay = 0; + else + link_delay = link_conf->beacon_int * + (link_removal_timeout[link_id] - 1); if (!delay) delay = link_delay; @@ -6731,8 +6745,18 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) } ifmgd->auth_data->timeout_started = true; } else if (ifmgd->assoc_data && + !ifmgd->assoc_data->comeback && (ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc))) { + /* + * Update association timeout based on the TX status + * for the (Re)Association Request frame. Skip this if + * we have already processed a (Re)Association Response + * frame that indicated need for association comeback + * at a specific time in the future. This could happen + * if the TX status information is delayed enough for + * the response to be received and processed first. + */ if (status_acked) { ifmgd->assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT_SHORT; @@ -7673,7 +7697,8 @@ ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, bitmap = get_unaligned_le16(disable_subchannel_bitmap); if (cfg80211_valid_disable_subchannel_bitmap(&bitmap, - &link->conf->chandef)) + &link->conf->chandef) && + !(bitmap && ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING))) ieee80211_handle_puncturing_bitmap(link, eht_oper, bitmap, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 64352e4e6d..541b0f53c6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -19,6 +19,7 @@ #include <linux/export.h> #include <linux/kcov.h> #include <linux/bitops.h> +#include <kunit/visibility.h> #include <net/mac80211.h> #include <net/ieee80211_radiotap.h> #include <asm/unaligned.h> @@ -566,7 +567,8 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, if (local->hw.radiotap_timestamp.units_pos >= 0) { u16 accuracy = 0; - u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; + u8 flags; + u64 ts; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_TIMESTAMP)); @@ -575,7 +577,15 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, while ((pos - (u8 *)rthdr) & 7) pos++; - put_unaligned_le64(status->device_timestamp, pos); + if (status->flag & RX_FLAG_MACTIME_IS_RTAP_TS64) { + flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT; + ts = status->mactime; + } else { + flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; + ts = status->device_timestamp; + } + + put_unaligned_le64(ts, pos); pos += sizeof(u64); if (local->hw.radiotap_timestamp.accuracy >= 0) { @@ -920,7 +930,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) * Drivers always need to pass packets that are aligned to two-byte boundaries * to the stack. * - * Additionally, should, if possible, align the payload data in a way that + * Additionally, they should, if possible, align the payload data in a way that * guarantees that the contained IP header is aligned to a four-byte * boundary. In the case of regular frames, this simply means aligning the * payload to a four-byte boundary (because either the IP header is directly @@ -936,7 +946,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) * subframe to a length that is a multiple of four. * * Padding like Atheros hardware adds which is between the 802.11 header and - * the payload is not supported, the driver is required to move the 802.11 + * the payload is not supported; the driver is required to move the 802.11 * header to be directly in front of the payload in that case. */ static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx) @@ -2405,7 +2415,7 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) return 0; } -static ieee80211_rx_result +VISIBLE_IF_MAC80211_KUNIT ieee80211_rx_result ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); @@ -2484,6 +2494,7 @@ ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_drop_unencrypted_mgmt); static ieee80211_rx_result __ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control) @@ -2735,7 +2746,10 @@ ieee80211_rx_mesh_fast_forward(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int hdrlen) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee80211_mesh_fast_tx *entry = NULL; + struct ieee80211_mesh_fast_tx_key key = { + .type = MESH_FAST_TX_TYPE_FORWARDED + }; + struct ieee80211_mesh_fast_tx *entry; struct ieee80211s_hdr *mesh_hdr; struct tid_ampdu_tx *tid_tx; struct sta_info *sta; @@ -2744,9 +2758,13 @@ ieee80211_rx_mesh_fast_forward(struct ieee80211_sub_if_data *sdata, mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(eth)); if ((mesh_hdr->flags & MESH_FLAGS_AE) == MESH_FLAGS_AE_A5_A6) - entry = mesh_fast_tx_get(sdata, mesh_hdr->eaddr1); + ether_addr_copy(key.addr, mesh_hdr->eaddr1); else if (!(mesh_hdr->flags & MESH_FLAGS_AE)) - entry = mesh_fast_tx_get(sdata, skb->data); + ether_addr_copy(key.addr, skb->data); + else + return false; + + entry = mesh_fast_tx_get(sdata, &key); if (!entry) return false; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fca3f67ac0..f9d5842601 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -194,11 +194,32 @@ ieee80211_bss_info_update(struct ieee80211_local *local, if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { - bss_meta.parent_tsf = - ieee80211_calculate_rx_timestamp(local, rx_status, - len + FCS_LEN, 24); - ether_addr_copy(bss_meta.parent_bssid, - scan_sdata->vif.bss_conf.bssid); + struct ieee80211_bss_conf *link_conf = NULL; + + /* for an MLO connection, set the TSF data only in case we have + * an indication on which of the links the frame was received + */ + if (ieee80211_vif_is_mld(&scan_sdata->vif)) { + if (rx_status->link_valid) { + s8 link_id = rx_status->link_id; + + link_conf = + rcu_dereference(scan_sdata->vif.link_conf[link_id]); + } + } else { + link_conf = &scan_sdata->vif.bss_conf; + } + + if (link_conf) { + bss_meta.parent_tsf = + ieee80211_calculate_rx_timestamp(local, + rx_status, + len + FCS_LEN, + 24); + + ether_addr_copy(bss_meta.parent_bssid, + link_conf->bssid); + } } rcu_read_unlock(); @@ -672,6 +693,21 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, if (local->scan_req) return -EBUSY; + /* For an MLO connection, if a link ID was specified, validate that it + * is indeed active. If no link ID was specified, select one of the + * active links. + */ + if (ieee80211_vif_is_mld(&sdata->vif)) { + if (req->tsf_report_link_id >= 0) { + if (!(sdata->vif.active_links & + BIT(req->tsf_report_link_id))) + return -EINVAL; + } else { + req->tsf_report_link_id = + __ffs(sdata->vif.active_links); + } + } + if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; @@ -720,6 +756,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; + local->hw_scan_req->req.tsf_report_link_id = + req->tsf_report_link_id; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; @@ -1257,7 +1295,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) - return -ENOTSUPP; + return -EOPNOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { @@ -1322,7 +1360,7 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) - return -ENOTSUPP; + return -EOPNOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index bcf3f727fc..4391d8dd63 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -40,7 +40,7 @@ * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that - * the caller may not do much with the STA info before inserting it, in + * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * @@ -58,7 +58,7 @@ * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * - * There is no concept of ownership on a STA entry, each structure is + * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. @@ -2273,7 +2273,6 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, struct ieee80211_local *local = sta->sdata->local; u8 ac = ieee80211_ac_from_tid(tid); u32 airtime = 0; - u32 diff; if (sta->local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; @@ -2284,8 +2283,7 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, sta->airtime[ac].tx_airtime += tx_airtime; sta->airtime[ac].rx_airtime += rx_airtime; - diff = (u32)jiffies - sta->airtime[ac].last_active; - if (diff <= AIRTIME_ACTIVE_DURATION) + if (ieee80211_sta_keep_active(sta, ac)) sta->airtime[ac].deficit -= airtime; spin_unlock_bh(&local->active_txq_lock[ac]); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index f471304672..ac4c7a6f96 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -138,7 +138,7 @@ enum ieee80211_agg_stop_reason { struct airtime_info { u64 rx_airtime; u64 tx_airtime; - u32 last_active; + unsigned long last_active; s32 deficit; atomic_t aql_tx_pending; /* Estimated airtime for frames pending */ u32 aql_limit_low; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 05a7dff69f..49730b4241 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1001,7 +1001,7 @@ ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata, skb); break; default: - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; break; } @@ -1071,7 +1071,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, /* any value is ok */ break; default: - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; break; } @@ -1177,7 +1177,7 @@ ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev, smps_mode != IEEE80211_SMPS_OFF) { tdls_dbg(sdata, "Aborting TDLS setup due to SMPS mode %d\n", smps_mode); - return -ENOTSUPP; + return -EOPNOTSUPP; } lockdep_assert_wiphy(local->hw.wiphy); @@ -1289,7 +1289,7 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, int ret; if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) - return -ENOTSUPP; + return -EOPNOTSUPP; /* make sure we are in managed mode, and associated */ if (sdata->vif.type != NL80211_IFTYPE_STATION || @@ -1446,7 +1446,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, lockdep_assert_wiphy(local->hw.wiphy); if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; @@ -1459,7 +1459,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, case NL80211_TDLS_SETUP: case NL80211_TDLS_DISCOVERY_REQ: /* We don't support in-driver setup/teardown/discovery */ - return -ENOTSUPP; + return -EOPNOTSUPP; } /* protect possible bss_conf changes and avoid concurrency in @@ -1510,7 +1510,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, return ret; break; default: - return -ENOTSUPP; + return -EOPNOTSUPP; } if (ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) { @@ -1673,7 +1673,7 @@ ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, if (!test_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH)) { tdls_dbg(sdata, "TDLS channel switch unsupported by %pM\n", addr); - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; goto out; } @@ -1993,7 +1993,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, if (!sta->sta.deflink.ht_cap.ht_supported && elems->sec_chan_offs && elems->sec_chan_offs->sec_chan_offs) { tdls_dbg(sdata, "TDLS chan switch - wide chan unsupported\n"); - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; goto out; } diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile index 4814584f8a..4fdaf3feac 100644 --- a/net/mac80211/tests/Makefile +++ b/net/mac80211/tests/Makefile @@ -1,3 +1,3 @@ -mac80211-tests-y += module.o elems.o +mac80211-tests-y += module.o elems.o mfp.o obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o diff --git a/net/mac80211/tests/mfp.c b/net/mac80211/tests/mfp.c new file mode 100644 index 0000000000..a8dc1601da --- /dev/null +++ b/net/mac80211/tests/mfp.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for management frame acceptance + * + * Copyright (C) 2023 Intel Corporation + */ +#include <kunit/test.h> +#include <kunit/skbuff.h> +#include "../ieee80211_i.h" +#include "../sta_info.h" + +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); + +static const struct mfp_test_case { + const char *desc; + bool sta, mfp, decrypted, unicast, assoc; + u8 category; + u8 stype; + u8 action; + ieee80211_rx_result result; +} accept_mfp_cases[] = { + /* regular public action */ + { + .desc = "public action: accept unicast from unknown peer", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PUBLIC, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = true, + .result = RX_CONTINUE, + }, + { + .desc = "public action: accept multicast from unknown peer", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PUBLIC, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = false, + .result = RX_CONTINUE, + }, + { + .desc = "public action: accept unicast without MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PUBLIC, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = true, + .sta = true, + .result = RX_CONTINUE, + }, + { + .desc = "public action: accept multicast without MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PUBLIC, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = false, + .sta = true, + .result = RX_CONTINUE, + }, + { + .desc = "public action: drop unicast with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PUBLIC, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = true, + .sta = true, + .mfp = true, + .result = RX_DROP_U_UNPROT_UNICAST_PUB_ACTION, + }, + { + .desc = "public action: accept multicast with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PUBLIC, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = false, + .sta = true, + .mfp = true, + .result = RX_CONTINUE, + }, + /* protected dual of public action */ + { + .desc = "protected dual: drop unicast from unknown peer", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = true, + .result = RX_DROP_U_UNPROT_DUAL, + }, + { + .desc = "protected dual: drop multicast from unknown peer", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = false, + .result = RX_DROP_U_UNPROT_DUAL, + }, + { + .desc = "protected dual: drop unicast without MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = true, + .sta = true, + .result = RX_DROP_U_UNPROT_DUAL, + }, + { + .desc = "protected dual: drop multicast without MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = false, + .sta = true, + .result = RX_DROP_U_UNPROT_DUAL, + }, + { + .desc = "protected dual: drop undecrypted unicast with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = true, + .sta = true, + .mfp = true, + .result = RX_DROP_U_UNPROT_DUAL, + }, + { + .desc = "protected dual: drop undecrypted multicast with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .unicast = false, + .sta = true, + .mfp = true, + .result = RX_DROP_U_UNPROT_DUAL, + }, + { + .desc = "protected dual: accept unicast with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .decrypted = true, + .unicast = true, + .sta = true, + .mfp = true, + .result = RX_CONTINUE, + }, + { + .desc = "protected dual: accept multicast with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION, + .action = WLAN_PUB_ACTION_DSE_ENABLEMENT, + .decrypted = true, + .unicast = false, + .sta = true, + .mfp = true, + .result = RX_CONTINUE, + }, + /* deauth/disassoc before keys are set */ + { + .desc = "deauth: accept unicast with MFP but w/o key", + .stype = IEEE80211_STYPE_DEAUTH, + .sta = true, + .mfp = true, + .unicast = true, + .result = RX_CONTINUE, + }, + { + .desc = "disassoc: accept unicast with MFP but w/o key", + .stype = IEEE80211_STYPE_DEAUTH, + .sta = true, + .mfp = true, + .unicast = true, + .result = RX_CONTINUE, + }, + /* non-public robust action frame ... */ + { + .desc = "BA action: drop unicast before assoc", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_BACK, + .unicast = true, + .sta = true, + .result = RX_DROP_U_UNPROT_ROBUST_ACTION, + }, + { + .desc = "BA action: drop unprotected after assoc", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_BACK, + .unicast = true, + .sta = true, + .mfp = true, + .result = RX_DROP_U_UNPROT_UCAST_MGMT, + }, + { + .desc = "BA action: accept unprotected without MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_BACK, + .unicast = true, + .sta = true, + .assoc = true, + .mfp = false, + .result = RX_CONTINUE, + }, + { + .desc = "BA action: drop unprotected with MFP", + .stype = IEEE80211_STYPE_ACTION, + .category = WLAN_CATEGORY_BACK, + .unicast = true, + .sta = true, + .mfp = true, + .result = RX_DROP_U_UNPROT_UCAST_MGMT, + }, +}; + +KUNIT_ARRAY_PARAM_DESC(accept_mfp, accept_mfp_cases, desc); + +static void accept_mfp(struct kunit *test) +{ + static struct sta_info sta; + const struct mfp_test_case *params = test->param_value; + struct ieee80211_rx_data rx = { + .sta = params->sta ? &sta : NULL, + }; + struct ieee80211_rx_status *status; + struct ieee80211_hdr_3addr hdr = { + .frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + params->stype), + .addr1 = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .addr2 = { 0x12, 0x22, 0x33, 0x44, 0x55, 0x66 }, + /* A3/BSSID doesn't matter here */ + }; + + memset(&sta, 0, sizeof(sta)); + + if (!params->sta) { + KUNIT_ASSERT_FALSE(test, params->mfp); + KUNIT_ASSERT_FALSE(test, params->decrypted); + } + + if (params->mfp) + set_sta_flag(&sta, WLAN_STA_MFP); + + if (params->assoc) + set_bit(WLAN_STA_ASSOC, &sta._flags); + + rx.skb = kunit_zalloc_skb(test, 128, GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, rx.skb); + status = IEEE80211_SKB_RXCB(rx.skb); + + if (params->decrypted) { + status->flag |= RX_FLAG_DECRYPTED; + if (params->unicast) + hdr.frame_control |= + cpu_to_le16(IEEE80211_FCTL_PROTECTED); + } + + if (params->unicast) + hdr.addr1[0] = 0x02; + + skb_put_data(rx.skb, &hdr, sizeof(hdr)); + + switch (params->stype) { + case IEEE80211_STYPE_ACTION: + skb_put_u8(rx.skb, params->category); + skb_put_u8(rx.skb, params->action); + break; + case IEEE80211_STYPE_DEAUTH: + case IEEE80211_STYPE_DISASSOC: { + __le16 reason = cpu_to_le16(WLAN_REASON_UNSPECIFIED); + + skb_put_data(rx.skb, &reason, sizeof(reason)); + } + break; + } + + KUNIT_EXPECT_EQ(test, + (__force u32)ieee80211_drop_unencrypted_mgmt(&rx), + (__force u32)params->result); +} + +static struct kunit_case mfp_test_cases[] = { + KUNIT_CASE_PARAM(accept_mfp, accept_mfp_gen_params), + {} +}; + +static struct kunit_suite mfp = { + .name = "mac80211-mfp", + .test_cases = mfp_test_cases, +}; + +kunit_test_suite(mfp); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 032718d5b2..06835ed4c4 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2512,6 +2512,31 @@ TRACE_EVENT(drv_net_setup_tc, ) ); +TRACE_EVENT(drv_can_activate_links, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u16 active_links), + + TP_ARGS(local, sdata, active_links), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u16, active_links) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->active_links = active_links; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " requested active_links:0x%04x\n", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->active_links + ) +); + TRACE_EVENT(drv_change_vif_links, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a85918594c..6fbb15b659 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4015,14 +4015,13 @@ ieee80211_txq_set_active(struct txq_info *txqi) return; sta = container_of(txqi->txq.sta, struct sta_info, sta); - sta->airtime[txqi->txq.ac].last_active = (u32)jiffies; + sta->airtime[txqi->txq.ac].last_active = jiffies; } static bool ieee80211_txq_keep_active(struct txq_info *txqi) { struct sta_info *sta; - u32 diff; if (!txqi->txq.sta) return false; @@ -4031,9 +4030,7 @@ ieee80211_txq_keep_active(struct txq_info *txqi) if (ieee80211_sta_deficit(sta, txqi->txq.ac) >= 0) return false; - diff = (u32)jiffies - sta->airtime[txqi->txq.ac].last_active; - - return diff <= AIRTIME_ACTIVE_DURATION; + return ieee80211_sta_keep_active(sta, txqi->txq.ac); } struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ed680120d5..643c54855b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4176,6 +4176,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, unsigned int mpdu_offset) { u64 ts = status->mactime; + bool mactime_plcp_start; struct rate_info ri; u16 rate; u8 n_ltf; @@ -4183,6 +4184,9 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, if (WARN_ON(!ieee80211_have_rx_timestamp(status))) return 0; + mactime_plcp_start = (status->flag & RX_FLAG_MACTIME) == + RX_FLAG_MACTIME_PLCP_START; + memset(&ri, 0, sizeof(ri)); ri.bw = status->bw; @@ -4197,7 +4201,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; /* TODO/FIXME: is this right? handle other PPDUs */ - if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; } @@ -4214,7 +4218,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, * See P802.11ax_D6.0, section 27.3.4 for * VHT PPDU format. */ - if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; @@ -4238,7 +4242,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, * See P802.11REVmd_D3.0, section 19.3.2 for * HT PPDU format. */ - if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + if (mactime_plcp_start) { mpdu_offset += 2; if (status->enc_flags & RX_ENC_FLAG_HT_GF) ts += 24; @@ -4266,7 +4270,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, * See P802.11REVmd_D3.0, section 21.3.2 for * VHT PPDU format. */ - if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + if (mactime_plcp_start) { mpdu_offset += 2; ts += 36; @@ -4288,7 +4292,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, sband = local->hw.wiphy->bands[status->band]; ri.legacy = sband->bitrates[status->rate_idx].bitrate; - if (status->flag & RX_FLAG_MACTIME_PLCP_START) { + if (mactime_plcp_start) { if (status->band == NL80211_BAND_5GHZ) { ts += 20; mpdu_offset += 2; @@ -4310,7 +4314,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, return 0; /* rewind from end of MPDU */ - if (status->flag & RX_FLAG_MACTIME_END) + if ((status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_END) ts -= mpdu_len * 8 * 10 / rate; ts += mpdu_offset * 8 * 10 / rate; diff --git a/net/mac80211/wbrf.c b/net/mac80211/wbrf.c new file mode 100644 index 0000000000..3a86123091 --- /dev/null +++ b/net/mac80211/wbrf.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Wifi Band Exclusion Interface for WLAN + * Copyright (C) 2023 Advanced Micro Devices + * + */ + +#include <linux/acpi_amd_wbrf.h> +#include <linux/units.h> +#include <net/cfg80211.h> +#include "ieee80211_i.h" + +void ieee80211_check_wbrf_support(struct ieee80211_local *local) +{ + struct wiphy *wiphy = local->hw.wiphy; + struct device *dev; + + if (!wiphy) + return; + + dev = wiphy->dev.parent; + if (!dev) + return; + + local->wbrf_supported = acpi_amd_wbrf_supported_producer(dev); +} + +static void get_chan_freq_boundary(u32 center_freq, u32 bandwidth, u64 *start, u64 *end) +{ + bandwidth *= KHZ_PER_MHZ; + center_freq *= KHZ_PER_MHZ; + + *start = center_freq - bandwidth / 2; + *end = center_freq + bandwidth / 2; + + /* Frequency in Hz is expected */ + *start = *start * HZ_PER_KHZ; + *end = *end * HZ_PER_KHZ; +} + +static void get_ranges_from_chandef(struct cfg80211_chan_def *chandef, + struct wbrf_ranges_in_out *ranges_in) +{ + u64 start_freq1, end_freq1; + u64 start_freq2, end_freq2; + int bandwidth; + + bandwidth = nl80211_chan_width_to_mhz(chandef->width); + + get_chan_freq_boundary(chandef->center_freq1, bandwidth, &start_freq1, &end_freq1); + + ranges_in->band_list[0].start = start_freq1; + ranges_in->band_list[0].end = end_freq1; + ranges_in->num_of_ranges = 1; + + if (chandef->width == NL80211_CHAN_WIDTH_80P80) { + get_chan_freq_boundary(chandef->center_freq2, bandwidth, &start_freq2, &end_freq2); + + ranges_in->band_list[1].start = start_freq2; + ranges_in->band_list[1].end = end_freq2; + ranges_in->num_of_ranges++; + } +} + +void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef) +{ + struct wbrf_ranges_in_out ranges_in = {0}; + struct device *dev; + + if (!local->wbrf_supported) + return; + + dev = local->hw.wiphy->dev.parent; + + get_ranges_from_chandef(chandef, &ranges_in); + + acpi_amd_wbrf_add_remove(dev, WBRF_RECORD_ADD, &ranges_in); +} + +void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef) +{ + struct wbrf_ranges_in_out ranges_in = {0}; + struct device *dev; + + if (!local->wbrf_supported) + return; + + dev = local->hw.wiphy->dev.parent; + + get_ranges_from_chandef(chandef, &ranges_in); + + acpi_amd_wbrf_add_remove(dev, WBRF_RECORD_REMOVE, &ranges_in); +} diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 5c3cb019f7..ef7f23af04 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -315,6 +315,179 @@ static int mac802154_stop_beacons(struct wpan_phy *wpan_phy, return mac802154_stop_beacons_locked(local, sdata); } +static int mac802154_associate(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *coord) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + u64 ceaddr = swab64((__force u64)coord->extended_addr); + struct ieee802154_sub_if_data *sdata; + struct ieee802154_pan_device *parent; + __le16 short_addr; + int ret; + + ASSERT_RTNL(); + + sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); + + if (wpan_dev->parent) { + dev_err(&sdata->dev->dev, + "Device %8phC is already associated\n", &ceaddr); + return -EPERM; + } + + if (coord->mode == IEEE802154_SHORT_ADDRESSING) + return -EINVAL; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) + return -ENOMEM; + + parent->pan_id = coord->pan_id; + parent->mode = coord->mode; + parent->extended_addr = coord->extended_addr; + parent->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST); + + /* Set the PAN ID hardware address filter beforehand to avoid dropping + * the association response with a destination PAN ID field set to the + * "new" PAN ID. + */ + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, coord->pan_id); + if (ret < 0) + goto free_parent; + } + + ret = mac802154_perform_association(sdata, parent, &short_addr); + if (ret) + goto reset_panid; + + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_short_addr(local, short_addr); + if (ret < 0) + goto reset_panid; + } + + wpan_dev->pan_id = coord->pan_id; + wpan_dev->short_addr = short_addr; + wpan_dev->parent = parent; + + return 0; + +reset_panid: + if (local->hw.flags & IEEE802154_HW_AFILT) + drv_set_pan_id(local, cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)); + +free_parent: + kfree(parent); + return ret; +} + +static int mac802154_disassociate_from_parent(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + struct ieee802154_pan_device *child, *tmp; + struct ieee802154_sub_if_data *sdata; + unsigned int max_assoc; + u64 eaddr; + int ret; + + sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); + + /* Start by disassociating all the children and preventing new ones to + * attempt associations. + */ + max_assoc = cfg802154_set_max_associations(wpan_dev, 0); + list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) { + ret = mac802154_send_disassociation_notif(sdata, child, + IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE); + if (ret) { + eaddr = swab64((__force u64)child->extended_addr); + dev_err(&sdata->dev->dev, + "Disassociation with %8phC may have failed (%d)\n", + &eaddr, ret); + } + + list_del(&child->node); + } + + ret = mac802154_send_disassociation_notif(sdata, wpan_dev->parent, + IEEE802154_DEVICE_WISHES_TO_LEAVE); + if (ret) { + eaddr = swab64((__force u64)wpan_dev->parent->extended_addr); + dev_err(&sdata->dev->dev, + "Disassociation from %8phC may have failed (%d)\n", + &eaddr, ret); + } + + ret = 0; + + kfree(wpan_dev->parent); + wpan_dev->parent = NULL; + wpan_dev->pan_id = cpu_to_le16(IEEE802154_PAN_ID_BROADCAST); + wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST); + + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, wpan_dev->pan_id); + if (ret < 0) + goto reset_mac_assoc; + + ret = drv_set_short_addr(local, wpan_dev->short_addr); + if (ret < 0) + goto reset_mac_assoc; + } + +reset_mac_assoc: + cfg802154_set_max_associations(wpan_dev, max_assoc); + + return ret; +} + +static int mac802154_disassociate_child(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_pan_device *child) +{ + struct ieee802154_sub_if_data *sdata; + int ret; + + sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); + + ret = mac802154_send_disassociation_notif(sdata, child, + IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE); + if (ret) + return ret; + + list_del(&child->node); + wpan_dev->nchildren--; + kfree(child); + + return 0; +} + +static int mac802154_disassociate(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *target) +{ + u64 teaddr = swab64((__force u64)target->extended_addr); + struct ieee802154_pan_device *pan_device; + + ASSERT_RTNL(); + + if (cfg802154_device_is_parent(wpan_dev, target)) + return mac802154_disassociate_from_parent(wpan_phy, wpan_dev); + + pan_device = cfg802154_device_is_child(wpan_dev, target); + if (pan_device) + return mac802154_disassociate_child(wpan_phy, wpan_dev, + pan_device); + + dev_err(&wpan_dev->netdev->dev, + "Device %8phC is not associated with us\n", &teaddr); + + return -EINVAL; +} + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static void ieee802154_get_llsec_table(struct wpan_phy *wpan_phy, @@ -526,6 +699,8 @@ const struct cfg802154_ops mac802154_config_ops = { .abort_scan = mac802154_abort_scan, .send_beacons = mac802154_send_beacons, .stop_beacons = mac802154_stop_beacons, + .associate = mac802154_associate, + .disassociate = mac802154_disassociate, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL .get_llsec_table = ieee802154_get_llsec_table, .lock_llsec_table = ieee802154_lock_llsec_table, diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index c347ec9ff8..08dd521a51 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -24,6 +24,7 @@ enum ieee802154_ongoing { IEEE802154_IS_SCANNING = BIT(0), IEEE802154_IS_BEACONING = BIT(1), + IEEE802154_IS_ASSOCIATING = BIT(2), }; /* mac802154 device private data */ @@ -74,6 +75,13 @@ struct ieee802154_local { struct list_head rx_mac_cmd_list; struct work_struct rx_mac_cmd_work; + /* Association */ + struct ieee802154_pan_device *assoc_dev; + struct completion assoc_done; + __le16 assoc_addr; + u8 assoc_status; + struct work_struct assoc_work; + bool started; bool suspended; unsigned long ongoing; @@ -296,6 +304,25 @@ static inline bool mac802154_is_beaconing(struct ieee802154_local *local) void mac802154_rx_mac_cmd_worker(struct work_struct *work); +int mac802154_perform_association(struct ieee802154_sub_if_data *sdata, + struct ieee802154_pan_device *coord, + __le16 *short_addr); +int mac802154_process_association_resp(struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); + +static inline bool mac802154_is_associating(struct ieee802154_local *local) +{ + return test_bit(IEEE802154_IS_ASSOCIATING, &local->ongoing); +} + +int mac802154_send_disassociation_notif(struct ieee802154_sub_if_data *sdata, + struct ieee802154_pan_device *target, + u8 reason); +int mac802154_process_disassociation_notif(struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); +int mac802154_process_association_req(struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); + /* interface handling */ int ieee802154_iface_init(void); void ieee802154_iface_exit(void); diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 357ece6743..9ab7396668 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -103,6 +103,8 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) INIT_DELAYED_WORK(&local->beacon_work, mac802154_beacon_worker); INIT_WORK(&local->rx_mac_cmd_work, mac802154_rx_mac_cmd_worker); + init_completion(&local->assoc_done); + /* init supported flags with 802.15.4 default ranges */ phy->supported.max_minbe = 8; phy->supported.min_maxbe = 3; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index e2434b4fe5..e40a988d6c 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -93,6 +93,31 @@ void mac802154_rx_mac_cmd_worker(struct work_struct *work) queue_delayed_work(local->mac_wq, &local->beacon_work, 0); break; + + case IEEE802154_CMD_ASSOCIATION_RESP: + dev_dbg(&mac_pkt->sdata->dev->dev, "processing ASSOC RESP\n"); + if (!mac802154_is_associating(local)) + break; + + mac802154_process_association_resp(mac_pkt->sdata, mac_pkt->skb); + break; + + case IEEE802154_CMD_ASSOCIATION_REQ: + dev_dbg(&mac_pkt->sdata->dev->dev, "processing ASSOC REQ\n"); + if (mac_pkt->sdata->wpan_dev.iftype != NL802154_IFTYPE_COORD) + break; + + mac802154_process_association_req(mac_pkt->sdata, mac_pkt->skb); + break; + + case IEEE802154_CMD_DISASSOCIATION_NOTIFY: + dev_dbg(&mac_pkt->sdata->dev->dev, "processing DISASSOC NOTIF\n"); + if (mac_pkt->sdata->wpan_dev.iftype != NL802154_IFTYPE_COORD) + break; + + mac802154_process_disassociation_notif(mac_pkt->sdata, mac_pkt->skb); + break; + default: break; } @@ -131,12 +156,15 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: - if (hdr->source.mode != IEEE802154_ADDR_NONE) - /* FIXME: check if we are PAN coordinator */ - skb->pkt_type = PACKET_OTHERHOST; - else + if (hdr->source.mode == IEEE802154_ADDR_NONE) /* ACK comes with both addresses empty */ skb->pkt_type = PACKET_HOST; + else if (!wpan_dev->parent) + /* No dest means PAN coordinator is the recipient */ + skb->pkt_type = PACKET_HOST; + else + /* We are not the PAN coordinator, just relaying */ + skb->pkt_type = PACKET_OTHERHOST; break; case IEEE802154_ADDR_LONG: if (mac_cb(skb)->dest.pan_id != span && diff --git a/net/mac802154/scan.c b/net/mac802154/scan.c index d9658f2c4a..1c0eeaa765 100644 --- a/net/mac802154/scan.c +++ b/net/mac802154/scan.c @@ -466,6 +466,7 @@ int mac802154_send_beacons_locked(struct ieee802154_sub_if_data *sdata, struct cfg802154_beacon_request *request) { struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; ASSERT_RTNL(); @@ -495,8 +496,7 @@ int mac802154_send_beacons_locked(struct ieee802154_sub_if_data *sdata, local->beacon.mac_pl.superframe_order = request->interval; local->beacon.mac_pl.final_cap_slot = 0xf; local->beacon.mac_pl.battery_life_ext = 0; - /* TODO: Fill this field with the coordinator situation in the network */ - local->beacon.mac_pl.pan_coordinator = 1; + local->beacon.mac_pl.pan_coordinator = !wpan_dev->parent; local->beacon.mac_pl.assoc_permit = 1; if (request->interval == IEEE802154_ACTIVE_SCAN_DURATION) @@ -510,3 +510,406 @@ int mac802154_send_beacons_locked(struct ieee802154_sub_if_data *sdata, return 0; } + +int mac802154_perform_association(struct ieee802154_sub_if_data *sdata, + struct ieee802154_pan_device *coord, + __le16 *short_addr) +{ + u64 ceaddr = swab64((__force u64)coord->extended_addr); + struct ieee802154_association_req_frame frame = {}; + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct sk_buff *skb; + int ret; + + frame.mhr.fc.type = IEEE802154_FC_TYPE_MAC_CMD; + frame.mhr.fc.security_enabled = 0; + frame.mhr.fc.frame_pending = 0; + frame.mhr.fc.ack_request = 1; /* We always expect an ack here */ + frame.mhr.fc.intra_pan = 0; + frame.mhr.fc.dest_addr_mode = (coord->mode == IEEE802154_ADDR_LONG) ? + IEEE802154_EXTENDED_ADDRESSING : IEEE802154_SHORT_ADDRESSING; + frame.mhr.fc.version = IEEE802154_2003_STD; + frame.mhr.fc.source_addr_mode = IEEE802154_EXTENDED_ADDRESSING; + frame.mhr.source.mode = IEEE802154_ADDR_LONG; + frame.mhr.source.pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); + frame.mhr.source.extended_addr = wpan_dev->extended_addr; + frame.mhr.dest.mode = coord->mode; + frame.mhr.dest.pan_id = coord->pan_id; + if (coord->mode == IEEE802154_ADDR_LONG) + frame.mhr.dest.extended_addr = coord->extended_addr; + else + frame.mhr.dest.short_addr = coord->short_addr; + frame.mhr.seq = atomic_inc_return(&wpan_dev->dsn) & 0xFF; + frame.mac_pl.cmd_id = IEEE802154_CMD_ASSOCIATION_REQ; + frame.assoc_req_pl.device_type = 1; + frame.assoc_req_pl.power_source = 1; + frame.assoc_req_pl.rx_on_when_idle = 1; + frame.assoc_req_pl.alloc_addr = 1; + + skb = alloc_skb(IEEE802154_MAC_CMD_SKB_SZ + sizeof(frame.assoc_req_pl), + GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + skb->dev = sdata->dev; + + ret = ieee802154_mac_cmd_push(skb, &frame, &frame.assoc_req_pl, + sizeof(frame.assoc_req_pl)); + if (ret) { + kfree_skb(skb); + return ret; + } + + local->assoc_dev = coord; + reinit_completion(&local->assoc_done); + set_bit(IEEE802154_IS_ASSOCIATING, &local->ongoing); + + ret = ieee802154_mlme_tx_one_locked(local, sdata, skb); + if (ret) { + if (ret > 0) + ret = (ret == IEEE802154_NO_ACK) ? -EREMOTEIO : -EIO; + dev_warn(&sdata->dev->dev, + "No ASSOC REQ ACK received from %8phC\n", &ceaddr); + goto clear_assoc; + } + + ret = wait_for_completion_killable_timeout(&local->assoc_done, 10 * HZ); + if (ret <= 0) { + dev_warn(&sdata->dev->dev, + "No ASSOC RESP received from %8phC\n", &ceaddr); + ret = -ETIMEDOUT; + goto clear_assoc; + } + + if (local->assoc_status != IEEE802154_ASSOCIATION_SUCCESSFUL) { + if (local->assoc_status == IEEE802154_PAN_AT_CAPACITY) + ret = -ERANGE; + else + ret = -EPERM; + + dev_warn(&sdata->dev->dev, + "Negative ASSOC RESP received from %8phC: %s\n", &ceaddr, + local->assoc_status == IEEE802154_PAN_AT_CAPACITY ? + "PAN at capacity" : "access denied"); + } + + ret = 0; + *short_addr = local->assoc_addr; + +clear_assoc: + clear_bit(IEEE802154_IS_ASSOCIATING, &local->ongoing); + local->assoc_dev = NULL; + + return ret; +} + +int mac802154_process_association_resp(struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee802154_addr *src = &mac_cb(skb)->source; + struct ieee802154_addr *dest = &mac_cb(skb)->dest; + u64 deaddr = swab64((__force u64)dest->extended_addr); + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct ieee802154_assoc_resp_pl resp_pl = {}; + + if (skb->len != sizeof(resp_pl)) + return -EINVAL; + + if (unlikely(src->mode != IEEE802154_EXTENDED_ADDRESSING || + dest->mode != IEEE802154_EXTENDED_ADDRESSING)) + return -EINVAL; + + if (unlikely(dest->extended_addr != wpan_dev->extended_addr || + src->extended_addr != local->assoc_dev->extended_addr)) + return -ENODEV; + + memcpy(&resp_pl, skb->data, sizeof(resp_pl)); + local->assoc_addr = resp_pl.short_addr; + local->assoc_status = resp_pl.status; + + dev_dbg(&skb->dev->dev, + "ASSOC RESP 0x%x received from %8phC, getting short address %04x\n", + local->assoc_status, &deaddr, local->assoc_addr); + + complete(&local->assoc_done); + + return 0; +} + +int mac802154_send_disassociation_notif(struct ieee802154_sub_if_data *sdata, + struct ieee802154_pan_device *target, + u8 reason) +{ + struct ieee802154_disassociation_notif_frame frame = {}; + u64 teaddr = swab64((__force u64)target->extended_addr); + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct sk_buff *skb; + int ret; + + frame.mhr.fc.type = IEEE802154_FC_TYPE_MAC_CMD; + frame.mhr.fc.security_enabled = 0; + frame.mhr.fc.frame_pending = 0; + frame.mhr.fc.ack_request = 1; + frame.mhr.fc.intra_pan = 1; + frame.mhr.fc.dest_addr_mode = (target->mode == IEEE802154_ADDR_LONG) ? + IEEE802154_EXTENDED_ADDRESSING : IEEE802154_SHORT_ADDRESSING; + frame.mhr.fc.version = IEEE802154_2003_STD; + frame.mhr.fc.source_addr_mode = IEEE802154_EXTENDED_ADDRESSING; + frame.mhr.source.mode = IEEE802154_ADDR_LONG; + frame.mhr.source.pan_id = wpan_dev->pan_id; + frame.mhr.source.extended_addr = wpan_dev->extended_addr; + frame.mhr.dest.mode = target->mode; + frame.mhr.dest.pan_id = wpan_dev->pan_id; + if (target->mode == IEEE802154_ADDR_LONG) + frame.mhr.dest.extended_addr = target->extended_addr; + else + frame.mhr.dest.short_addr = target->short_addr; + frame.mhr.seq = atomic_inc_return(&wpan_dev->dsn) & 0xFF; + frame.mac_pl.cmd_id = IEEE802154_CMD_DISASSOCIATION_NOTIFY; + frame.disassoc_pl = reason; + + skb = alloc_skb(IEEE802154_MAC_CMD_SKB_SZ + sizeof(frame.disassoc_pl), + GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + skb->dev = sdata->dev; + + ret = ieee802154_mac_cmd_push(skb, &frame, &frame.disassoc_pl, + sizeof(frame.disassoc_pl)); + if (ret) { + kfree_skb(skb); + return ret; + } + + ret = ieee802154_mlme_tx_one_locked(local, sdata, skb); + if (ret) { + dev_warn(&sdata->dev->dev, + "No DISASSOC ACK received from %8phC\n", &teaddr); + if (ret > 0) + ret = (ret == IEEE802154_NO_ACK) ? -EREMOTEIO : -EIO; + return ret; + } + + dev_dbg(&sdata->dev->dev, "DISASSOC ACK received from %8phC\n", &teaddr); + return 0; +} + +static int +mac802154_send_association_resp_locked(struct ieee802154_sub_if_data *sdata, + struct ieee802154_pan_device *target, + struct ieee802154_assoc_resp_pl *assoc_resp_pl) +{ + u64 teaddr = swab64((__force u64)target->extended_addr); + struct ieee802154_association_resp_frame frame = {}; + struct ieee802154_local *local = sdata->local; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct sk_buff *skb; + int ret; + + frame.mhr.fc.type = IEEE802154_FC_TYPE_MAC_CMD; + frame.mhr.fc.security_enabled = 0; + frame.mhr.fc.frame_pending = 0; + frame.mhr.fc.ack_request = 1; /* We always expect an ack here */ + frame.mhr.fc.intra_pan = 1; + frame.mhr.fc.dest_addr_mode = IEEE802154_EXTENDED_ADDRESSING; + frame.mhr.fc.version = IEEE802154_2003_STD; + frame.mhr.fc.source_addr_mode = IEEE802154_EXTENDED_ADDRESSING; + frame.mhr.source.mode = IEEE802154_ADDR_LONG; + frame.mhr.source.extended_addr = wpan_dev->extended_addr; + frame.mhr.dest.mode = IEEE802154_ADDR_LONG; + frame.mhr.dest.pan_id = wpan_dev->pan_id; + frame.mhr.dest.extended_addr = target->extended_addr; + frame.mhr.seq = atomic_inc_return(&wpan_dev->dsn) & 0xFF; + frame.mac_pl.cmd_id = IEEE802154_CMD_ASSOCIATION_RESP; + + skb = alloc_skb(IEEE802154_MAC_CMD_SKB_SZ + sizeof(*assoc_resp_pl), + GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + skb->dev = sdata->dev; + + ret = ieee802154_mac_cmd_push(skb, &frame, assoc_resp_pl, + sizeof(*assoc_resp_pl)); + if (ret) { + kfree_skb(skb); + return ret; + } + + ret = ieee802154_mlme_tx_locked(local, sdata, skb); + if (ret) { + dev_warn(&sdata->dev->dev, + "No ASSOC RESP ACK received from %8phC\n", &teaddr); + if (ret > 0) + ret = (ret == IEEE802154_NO_ACK) ? -EREMOTEIO : -EIO; + return ret; + } + + return 0; +} + +int mac802154_process_association_req(struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct ieee802154_addr *src = &mac_cb(skb)->source; + struct ieee802154_addr *dest = &mac_cb(skb)->dest; + struct ieee802154_assoc_resp_pl assoc_resp_pl = {}; + struct ieee802154_assoc_req_pl assoc_req_pl; + struct ieee802154_pan_device *child, *exchild; + struct ieee802154_addr tmp = {}; + u64 ceaddr; + int ret; + + if (skb->len != sizeof(assoc_req_pl)) + return -EINVAL; + + if (unlikely(src->mode != IEEE802154_EXTENDED_ADDRESSING)) + return -EINVAL; + + if (unlikely(dest->pan_id != wpan_dev->pan_id)) + return -ENODEV; + + if (dest->mode == IEEE802154_EXTENDED_ADDRESSING && + unlikely(dest->extended_addr != wpan_dev->extended_addr)) + return -ENODEV; + else if (dest->mode == IEEE802154_SHORT_ADDRESSING && + unlikely(dest->short_addr != wpan_dev->short_addr)) + return -ENODEV; + + if (wpan_dev->parent) { + dev_dbg(&sdata->dev->dev, + "Ignoring ASSOC REQ, not the PAN coordinator\n"); + return -ENODEV; + } + + mutex_lock(&wpan_dev->association_lock); + + memcpy(&assoc_req_pl, skb->data, sizeof(assoc_req_pl)); + if (assoc_req_pl.assoc_type) { + dev_err(&skb->dev->dev, "Fast associations not supported yet\n"); + ret = -EOPNOTSUPP; + goto unlock; + } + + child = kzalloc(sizeof(*child), GFP_KERNEL); + if (!child) { + ret = -ENOMEM; + goto unlock; + } + + child->extended_addr = src->extended_addr; + child->mode = IEEE802154_EXTENDED_ADDRESSING; + ceaddr = swab64((__force u64)child->extended_addr); + + if (wpan_dev->nchildren >= wpan_dev->max_associations) { + if (!wpan_dev->max_associations) + assoc_resp_pl.status = IEEE802154_PAN_ACCESS_DENIED; + else + assoc_resp_pl.status = IEEE802154_PAN_AT_CAPACITY; + assoc_resp_pl.short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST); + dev_dbg(&sdata->dev->dev, + "Refusing ASSOC REQ from child %8phC, %s\n", &ceaddr, + assoc_resp_pl.status == IEEE802154_PAN_ACCESS_DENIED ? + "access denied" : "too many children"); + } else { + assoc_resp_pl.status = IEEE802154_ASSOCIATION_SUCCESSFUL; + if (assoc_req_pl.alloc_addr) { + assoc_resp_pl.short_addr = cfg802154_get_free_short_addr(wpan_dev); + child->mode = IEEE802154_SHORT_ADDRESSING; + } else { + assoc_resp_pl.short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + } + child->short_addr = assoc_resp_pl.short_addr; + dev_dbg(&sdata->dev->dev, + "Accepting ASSOC REQ from child %8phC, providing short address 0x%04x\n", + &ceaddr, le16_to_cpu(child->short_addr)); + } + + ret = mac802154_send_association_resp_locked(sdata, child, &assoc_resp_pl); + if (ret || assoc_resp_pl.status != IEEE802154_ASSOCIATION_SUCCESSFUL) { + kfree(child); + goto unlock; + } + + dev_dbg(&sdata->dev->dev, + "Successful association with new child %8phC\n", &ceaddr); + + /* Ensure this child is not already associated (might happen due to + * retransmissions), in this case drop the ex structure. + */ + tmp.mode = child->mode; + tmp.extended_addr = child->extended_addr; + exchild = cfg802154_device_is_child(wpan_dev, &tmp); + if (exchild) { + dev_dbg(&sdata->dev->dev, + "Child %8phC was already known\n", &ceaddr); + list_del(&exchild->node); + } + + list_add(&child->node, &wpan_dev->children); + wpan_dev->nchildren++; + +unlock: + mutex_unlock(&wpan_dev->association_lock); + return ret; +} + +int mac802154_process_disassociation_notif(struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee802154_addr *src = &mac_cb(skb)->source; + struct ieee802154_addr *dest = &mac_cb(skb)->dest; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct ieee802154_pan_device *child; + struct ieee802154_addr target; + bool parent; + u64 teaddr; + + if (skb->len != sizeof(u8)) + return -EINVAL; + + if (unlikely(src->mode != IEEE802154_EXTENDED_ADDRESSING)) + return -EINVAL; + + if (dest->mode == IEEE802154_EXTENDED_ADDRESSING && + unlikely(dest->extended_addr != wpan_dev->extended_addr)) + return -ENODEV; + else if (dest->mode == IEEE802154_SHORT_ADDRESSING && + unlikely(dest->short_addr != wpan_dev->short_addr)) + return -ENODEV; + + if (dest->pan_id != wpan_dev->pan_id) + return -ENODEV; + + target.mode = IEEE802154_EXTENDED_ADDRESSING; + target.extended_addr = src->extended_addr; + teaddr = swab64((__force u64)target.extended_addr); + dev_dbg(&skb->dev->dev, "Processing DISASSOC NOTIF from %8phC\n", &teaddr); + + mutex_lock(&wpan_dev->association_lock); + parent = cfg802154_device_is_parent(wpan_dev, &target); + if (!parent) + child = cfg802154_device_is_child(wpan_dev, &target); + if (!parent && !child) { + mutex_unlock(&wpan_dev->association_lock); + return -EINVAL; + } + + if (parent) { + kfree(wpan_dev->parent); + wpan_dev->parent = NULL; + } else { + list_del(&child->node); + kfree(child); + wpan_dev->nchildren--; + } + + mutex_unlock(&wpan_dev->association_lock); + + return 0; +} diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 533d082f07..45d1e6a157 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -27,6 +27,9 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, __be16 mpls_protocol; unsigned int mpls_hlen; + if (!skb_inner_network_header_was_set(skb)) + goto out; + skb_reset_network_header(skb); mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN)) diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index a2325e70dd..670da7822e 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/mptcp.yaml */ +/* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel source */ #include <net/netlink.h> diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h index 10579d1845..ac9fc7225b 100644 --- a/net/mptcp/mptcp_pm_gen.h +++ b/net/mptcp/mptcp_pm_gen.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/mptcp.yaml */ +/* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel header */ #ifndef _LINUX_MPTCP_PM_GEN_H diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index cccb720c1c..58d17d9604 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1109,7 +1109,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, - .flags = GENL_UNS_ADMIN_PERM, + .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 6eabd1d79f..bc97cc30f0 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -288,12 +288,12 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto remove_err; + goto out; } if (id_val == 0) { err = mptcp_userspace_pm_remove_id_zero_address(msk, info); - goto remove_err; + goto out; } lock_sock(sk); @@ -308,7 +308,7 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); release_sock(sk); - goto remove_err; + goto out; } list_move(&match->list, &free_list); @@ -322,7 +322,7 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) } err = 0; - remove_err: +out: sock_put(sk); return err; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 046ab95bc0..2b921af271 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -55,28 +55,14 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) return READ_ONCE(msk->wnd_end); } -static bool mptcp_is_tcpsk(struct sock *sk) +static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { - struct socket *sock = sk->sk_socket; - - if (unlikely(sk->sk_prot == &tcp_prot)) { - /* we are being invoked after mptcp_accept() has - * accepted a non-mp-capable flow: sk is a tcp_sk, - * not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - WRITE_ONCE(sock->ops, &inet_stream_ops); - return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - WRITE_ONCE(sock->ops, &inet6_stream_ops); - return true; + if (sk->sk_prot == &tcpv6_prot) + return &inet6_stream_ops; #endif - } - - return false; + WARN_ON_ONCE(sk->sk_prot != &tcp_prot); + return &inet_stream_ops; } static int __mptcp_socket_create(struct mptcp_sock *msk) @@ -3333,44 +3319,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } -static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, - bool kern) -{ - struct sock *newsk; - - pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); - newsk = inet_csk_accept(ssk, flags, err, kern); - if (!newsk) - return NULL; - - pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { - struct mptcp_subflow_context *subflow; - struct sock *new_mptcp_sock; - - subflow = mptcp_subflow_ctx(newsk); - new_mptcp_sock = subflow->conn; - - /* is_mptcp should be false if subflow->conn is missing, see - * subflow_syn_recv_sock() - */ - if (WARN_ON_ONCE(!new_mptcp_sock)) { - tcp_sk(newsk)->is_mptcp = 0; - goto out; - } - - newsk = new_mptcp_sock; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - } else { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); - } - -out: - newsk->sk_kern_sock = kern; - return newsk; -} - void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; @@ -3807,7 +3755,6 @@ static struct proto mptcp_prot = { .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, - .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, @@ -3917,18 +3864,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk) return -EINVAL; - newsk = mptcp_accept(ssk, flags, &err, kern); + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + newsk = inet_csk_accept(ssk, flags, &err, kern); if (!newsk) return err; - lock_sock(newsk); - - __inet_accept(sock, newsock, newsk); - if (!mptcp_is_tcpsk(newsock->sk)) { - struct mptcp_sock *msk = mptcp_sk(newsk); + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); + if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + + subflow = mptcp_subflow_ctx(newsk); + new_mptcp_sock = subflow->conn; + + /* is_mptcp should be false if subflow->conn is missing, see + * subflow_syn_recv_sock() + */ + if (WARN_ON_ONCE(!new_mptcp_sock)) { + tcp_sk(newsk)->is_mptcp = 0; + goto tcpfallback; + } + + newsk = new_mptcp_sock; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + + newsk->sk_kern_sock = kern; + lock_sock(newsk); + __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + msk = mptcp_sk(newsk); msk->in_accept_queue = 0; /* set ssk->sk_socket of accept()ed flows to mptcp socket. @@ -3950,6 +3915,19 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); } + } else { +tcpfallback: + newsk->sk_kern_sock = kern; + lock_sock(newsk); + __inet_accept(sock, newsock, newsk); + /* we are being invoked after accepting a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + WRITE_ONCE(newsock->sk->sk_socket->ops, + mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7384613ea2..07f6242afc 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1087,6 +1087,15 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } +static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) +{ + struct sock *ssk = READ_ONCE(msk->first); + + return ssk && ((1 << inet_sk_state_load(ssk)) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_LISTEN)); +} + static inline void mptcp_do_fallback(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 3536807337..ef3edba754 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -440,6 +440,8 @@ static bool mptcp_supported_sockopt(int level, int optname) /* should work fine */ case IP_FREEBIND: case IP_TRANSPARENT: + case IP_BIND_ADDRESS_NO_PORT: + case IP_LOCAL_PORT_RANGE: /* the following are control cmsg related */ case IP_PKTINFO: @@ -455,7 +457,6 @@ static bool mptcp_supported_sockopt(int level, int optname) /* common stuff that need some love */ case IP_TOS: case IP_TTL: - case IP_BIND_ADDRESS_NO_PORT: case IP_MTU_DISCOVER: case IP_RECVERR: @@ -683,8 +684,8 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op return 0; } -static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) +static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; @@ -710,6 +711,14 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); break; + case IP_BIND_ADDRESS_NO_PORT: + inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, + inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + break; + case IP_LOCAL_PORT_RANGE: + WRITE_ONCE(inet_sk(ssk)->local_port_range, + READ_ONCE(inet_sk(sk)->local_port_range)); + break; default: release_sock(sk); WARN_ON_ONCE(1); @@ -755,7 +764,9 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_FREEBIND: case IP_TRANSPARENT: - return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen); + case IP_BIND_ADDRESS_NO_PORT: + case IP_LOCAL_PORT_RANGE: + return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); } @@ -938,6 +949,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; + info->mptcpi_subflows_total = info->mptcpi_subflows + + __mptcp_has_initial_subflow(msk); unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); @@ -1348,6 +1361,12 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); + case IP_BIND_ADDRESS_NO_PORT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + case IP_LOCAL_PORT_RANGE: + return mptcp_put_int_option(msk, optval, optlen, + READ_ONCE(inet_sk(sk)->local_port_range)); } return -EOPNOTSUPP; @@ -1448,6 +1467,8 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); + inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) @@ -1479,6 +1500,10 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) struct mptcp_subflow_context *subflow; int space, cap; + /* bpf can land here with a wrong sk type */ + if (sk->sk_protocol == IPPROTO_TCP) + return -EINVAL; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 71ba86246f..13f66d11b7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -905,6 +905,8 @@ dispose_child: return child; fallback: + if (fallback) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index fd2236ee9a..b3ff37a181 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -270,7 +270,8 @@ static struct ncsi_cmd_handler { { NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default }, { NCSI_PKT_CMD_OEM, -1, ncsi_cmd_handler_oem }, { NCSI_PKT_CMD_PLDM, 0, NULL }, - { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default } + { NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default }, + { NCSI_PKT_CMD_GMCMA, 0, ncsi_cmd_handler_default } }; static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index d9da942ad5..745c788f1d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -689,8 +689,6 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return 0; } -#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) - static int ncsi_oem_keep_phy_intel(struct ncsi_cmd_arg *nca) { unsigned char data[NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN]; @@ -716,10 +714,6 @@ static int ncsi_oem_keep_phy_intel(struct ncsi_cmd_arg *nca) return ret; } -#endif - -#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) - /* NCSI OEM Command APIs */ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) { @@ -856,8 +850,6 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) return nch->handler(nca); } -#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ - /* Determine if a given channel from the channel_queue should be used for Tx */ static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc) @@ -1039,20 +1031,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) goto error; } - nd->state = ncsi_dev_state_config_oem_gma; + nd->state = IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) + ? ncsi_dev_state_config_oem_gma + : ncsi_dev_state_config_clear_vids; break; case ncsi_dev_state_config_oem_gma: nd->state = ncsi_dev_state_config_clear_vids; - ret = -1; -#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) - nca.type = NCSI_PKT_CMD_OEM; nca.package = np->id; nca.channel = nc->id; ndp->pending_req_num = 1; - ret = ncsi_gma_handler(&nca, nc->version.mf_id); -#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ - + if (nc->version.major >= 1 && nc->version.minor >= 2) { + nca.type = NCSI_PKT_CMD_GMCMA; + ret = ncsi_xmit_cmd(&nca); + } else { + nca.type = NCSI_PKT_CMD_OEM; + ret = ncsi_gma_handler(&nca, nc->version.mf_id); + } if (ret < 0) schedule_work(&ndp->work); @@ -1404,7 +1399,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) schedule_work(&ndp->work); break; -#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) case ncsi_dev_state_probe_mlx_gma: ndp->pending_req_num = 1; @@ -1429,7 +1423,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_cis; break; -#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */ case ncsi_dev_state_probe_cis: ndp->pending_req_num = NCSI_RESERVED_CHANNEL; @@ -1447,7 +1440,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)) nd->state = ncsi_dev_state_probe_keep_phy; break; -#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) case ncsi_dev_state_probe_keep_phy: ndp->pending_req_num = 1; @@ -1460,7 +1452,6 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_gvi; break; -#endif /* CONFIG_NCSI_OEM_CMD_KEEP_PHY */ case ncsi_dev_state_probe_gvi: case ncsi_dev_state_probe_gc: case ncsi_dev_state_probe_gls: diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index c9d1da34dc..f2f3b5c1b9 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -338,6 +338,14 @@ struct ncsi_rsp_gpuuid_pkt { __be32 checksum; }; +/* Get MC MAC Address */ +struct ncsi_rsp_gmcma_pkt { + struct ncsi_rsp_pkt_hdr rsp; + unsigned char address_count; + unsigned char reserved[3]; + unsigned char addresses[][ETH_ALEN]; +}; + /* AEN: Link State Change */ struct ncsi_aen_lsc_pkt { struct ncsi_aen_pkt_hdr aen; /* AEN header */ @@ -398,6 +406,7 @@ struct ncsi_aen_hncdsc_pkt { #define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */ #define NCSI_PKT_CMD_QPNPR 0x56 /* Query Pending NC PLDM request */ #define NCSI_PKT_CMD_SNPR 0x57 /* Send NC PLDM Reply */ +#define NCSI_PKT_CMD_GMCMA 0x58 /* Get MC MAC Address */ /* NCSI packet responses */ @@ -433,6 +442,7 @@ struct ncsi_aen_hncdsc_pkt { #define NCSI_PKT_RSP_GPUUID (NCSI_PKT_CMD_GPUUID + 0x80) #define NCSI_PKT_RSP_QPNPR (NCSI_PKT_CMD_QPNPR + 0x80) #define NCSI_PKT_RSP_SNPR (NCSI_PKT_CMD_SNPR + 0x80) +#define NCSI_PKT_RSP_GMCMA (NCSI_PKT_CMD_GMCMA + 0x80) /* NCSI response code/reason */ #define NCSI_PKT_RSP_C_COMPLETED 0x0000 /* Command Completed */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 480e80e3c2..bee290d0f4 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1091,6 +1091,44 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) return ret; } +static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + struct ncsi_rsp_gmcma_pkt *rsp; + struct sockaddr saddr; + int ret = -1; + int i; + + rsp = (struct ncsi_rsp_gmcma_pkt *)skb_network_header(nr->rsp); + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + netdev_info(ndev, "NCSI: Received %d provisioned MAC addresses\n", + rsp->address_count); + for (i = 0; i < rsp->address_count; i++) { + netdev_info(ndev, "NCSI: MAC address %d: %02x:%02x:%02x:%02x:%02x:%02x\n", + i, rsp->addresses[i][0], rsp->addresses[i][1], + rsp->addresses[i][2], rsp->addresses[i][3], + rsp->addresses[i][4], rsp->addresses[i][5]); + } + + for (i = 0; i < rsp->address_count; i++) { + memcpy(saddr.sa_data, &rsp->addresses[i], ETH_ALEN); + ret = ndev->netdev_ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) { + netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n", + saddr.sa_data); + continue; + } + netdev_warn(ndev, "NCSI: Set MAC address to %pM\n", saddr.sa_data); + break; + } + + ndp->gma_flag = ret == 0; + return ret; +} + static struct ncsi_rsp_handler { unsigned char type; int payload; @@ -1127,7 +1165,8 @@ static struct ncsi_rsp_handler { { NCSI_PKT_RSP_PLDM, -1, ncsi_rsp_handler_pldm }, { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid }, { NCSI_PKT_RSP_QPNPR, -1, ncsi_rsp_handler_pldm }, - { NCSI_PKT_RSP_SNPR, -1, ncsi_rsp_handler_pldm } + { NCSI_PKT_RSP_SNPR, -1, ncsi_rsp_handler_pldm }, + { NCSI_PKT_RSP_GMCMA, -1, ncsi_rsp_handler_gmcma }, }; int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 9523104a90..cb48a2b9cb 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -4,6 +4,8 @@ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H +#include <linux/rcupdate_wait.h> + #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 20aad81fca..cf3ce72c3d 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -5,6 +5,7 @@ #define _IP_SET_HASH_GEN_H #include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/jhash.h> #include <linux/types.h> #include <linux/netfilter/nfnetlink.h> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9065da3cdd..a743db0738 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -31,6 +31,7 @@ #include <linux/seq_file.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/rcupdate_wait.h> #include <net/net_namespace.h> #include <net/ip_vs.h> diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index c5970ba416..f821ad2e19 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -21,6 +21,7 @@ #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> +#include <linux/rcupdate_wait.h> #include <net/ip_vs.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index a0921adc31..1e689c7141 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, if (sctph->source != cp->vport || payload_csum || skb->ip_summed == CHECKSUM_PARTIAL) { sctph->source = cp->vport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else { skb->ip_summed = CHECKSUM_UNNECESSARY; } @@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, (skb->ip_summed == CHECKSUM_PARTIAL && !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; - sctp_nat_csum(skb, sctph, sctphoff); + if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb)) + sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_UNNECESSARY; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index eaf9f2ed00..be74c0906d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1365,7 +1365,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev) struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_IF */ - np->mcast_oif = dev->ifindex; + WRITE_ONCE(np->mcast_oif, dev->ifindex); } #endif release_sock(sk); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index fb0ae15e96..3b846cbdc0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -876,6 +876,7 @@ struct ctnetlink_filter_u32 { struct ctnetlink_filter { u8 family; + bool zone_filter; u_int32_t orig_flags; u_int32_t reply_flags; @@ -992,13 +993,16 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) if (err) goto err_filter; + if (cda[CTA_ZONE]) { + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + goto err_filter; + filter->zone_filter = true; + } + if (!cda[CTA_FILTER]) return filter; - err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); - if (err < 0) - goto err_filter; - err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); if (err < 0) goto err_filter; @@ -1043,7 +1047,7 @@ err_filter: static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { - return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS]; + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE]; } static int ctnetlink_start(struct netlink_callback *cb) @@ -1148,6 +1152,10 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->zone_filter && + !nf_ct_zone_equal_any(ct, &filter->zone)) + goto ignore_entry; + if (filter->orig_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188..6eef15648b 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade764..5383bed3d3 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -157,7 +157,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): - phdr = (struct pppoe_hdr *)skb_mac_header(skb); + phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; @@ -273,10 +273,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -287,7 +288,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +318,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index c3d7ecbc77..016c816d91 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -551,8 +551,11 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); - else + else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || + maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); + else + off = 0; attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 467671f2d4..fbbc4fd373 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -617,7 +617,7 @@ synproxy_recv_client_ack(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = __cookie_v4_check(ip_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; @@ -1034,7 +1034,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = nf_cookie_v6_check(ipv6_hdr(skb), th); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d07872814f..0e697e53a7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -594,6 +594,12 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; @@ -617,6 +623,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_set_elem_change_active(ctx->net, set, ext); nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } @@ -626,6 +633,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -1198,6 +1206,26 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ __NFT_TABLE_F_WAS_AWAKEN) +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->ctx.table == ctx->table && + ((trans->msg_type == NFT_MSG_NEWCHAIN && + nft_trans_chain_update(trans)) || + (trans->msg_type == NFT_MSG_DELCHAIN && + nft_is_base_chain(trans->ctx.chain)))) + return true; + } + + return false; +} + static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; @@ -1221,7 +1249,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ - if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + if (nft_table_pending_update(ctx)) return -EINVAL; trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, @@ -2418,6 +2446,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; + if (table->flags & __NFT_TABLE_F_UPDATE) + return -EINVAL; + if (flags & NFT_CHAIN_BINDING) return -EOPNOTSUPP; @@ -2619,6 +2650,13 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } } + if (table->flags & __NFT_TABLE_F_UPDATE && + !list_empty(&hook.list)) { + NL_SET_BAD_ATTR(extack, attr); + err = -EOPNOTSUPP; + goto err_hooks; + } + if (!(table->flags & NFT_TABLE_F_DORMANT) && nft_is_base_chain(chain) && !list_empty(&hook.list)) { @@ -2848,6 +2886,9 @@ static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_trans *trans; int err; + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) @@ -2932,7 +2973,8 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || + chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; if (nft_is_base_chain(chain)) { @@ -2982,6 +3024,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, */ int nft_register_expr(struct nft_expr_type *type) { + if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR)) + return -ENOMEM; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); @@ -3011,7 +3056,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, { const struct nft_expr_type *type, *candidate = NULL; - list_for_each_entry(type, &nf_tables_expressions, list) { + list_for_each_entry_rcu(type, &nf_tables_expressions, list) { if (!nla_strcmp(nla, type->name)) { if (!type->family && !candidate) candidate = type; @@ -3043,9 +3088,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); + rcu_read_lock(); type = __nft_expr_type_get(family, nla); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -3276,14 +3325,13 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) { int err; - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } + if (WARN_ON_ONCE(!src->ops->clone)) + return -EINVAL; + + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; __module_get(src->ops->type->owner); @@ -3827,6 +3875,9 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_data *data; int err; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -3850,17 +3901,20 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { - u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_iter dummy_iter = { + .genmask = nft_genmask_next(ctx->net), + }; struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; int ret = 0; list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) + if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; - ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); + ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem); if (ret < 0) return ret; } @@ -5006,6 +5060,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) return -EOPNOTSUPP; + if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) == + (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) + return -EOPNOTSUPP; } desc.dtype = 0; @@ -5354,6 +5411,11 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + return nft_setelem_data_validate(ctx, set, elem_priv); } @@ -5398,6 +5460,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5445,6 +5508,13 @@ static int nft_mapelem_activate(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + + /* called from abort path, reverse check to undo changes. */ + if (nft_set_elem_active(ext, iter->genmask)) + return 0; + + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; @@ -5462,6 +5532,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; + nft_clear(ctx->net, ext); nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } @@ -5471,6 +5542,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5735,6 +5807,9 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; @@ -5845,6 +5920,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -5857,10 +5933,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_end(skb, nest); nlmsg_end(skb, nlh); - if (dump_ctx->reset && args.iter.count > args.iter.skip) - audit_log_nft_set_reset(table, cb->seq, - args.iter.count - args.iter.skip); - rcu_read_unlock(); if (args.iter.err && args.iter.err != -EMSGSIZE) @@ -5876,6 +5948,26 @@ nla_put_failure: return -ENOSPC; } +static int nf_tables_dumpreset_set(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + struct nft_set_dump_ctx *dump_ctx = cb->data; + int ret, skip = cb->args[0]; + + mutex_lock(&nft_net->commit_mutex); + + ret = nf_tables_dump_set(skb, cb); + + if (cb->args[0] > skip) + audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq, + cb->args[0] - skip); + + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_set_start(struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ -5950,7 +6042,7 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; } -static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, +static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set, struct nft_data *key, struct nlattr *attr) { struct nft_data_desc desc = { @@ -6003,7 +6095,7 @@ static void *nft_setelem_catchall_get(const struct net *net, return priv; } -static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, +static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_elem *elem, u32 flags) { void *priv; @@ -6022,7 +6114,7 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, return 0; } -static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, +static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; @@ -6079,21 +6171,18 @@ err_fill_setelem: return err; } -/* called with rcu_read_lock held */ -static int nf_tables_getsetelem(struct sk_buff *skb, - const struct nfnl_info *info, - const struct nlattr * const nla[]) +static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, + const struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[], + bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; - int rem, err = 0, nelems = 0; struct net *net = info->net; struct nft_table *table; struct nft_set *set; - struct nlattr *attr; - struct nft_ctx ctx; - bool reset = false; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, genmask, 0); @@ -6108,10 +6197,22 @@ static int nf_tables_getsetelem(struct sk_buff *skb, return PTR_ERR(set); } - nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + nft_ctx_init(&dump_ctx->ctx, net, skb, + info->nlh, family, table, NULL, nla); + dump_ctx->set = set; + dump_ctx->reset = reset; + return 0; +} - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) - reset = true; +/* called with rcu_read_lock held */ +static int nf_tables_getsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct netlink_ext_ack *extack = info->extack; + struct nft_set_dump_ctx dump_ctx; + struct nlattr *attr; + int rem, err = 0; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -6120,12 +6221,55 @@ static int nf_tables_getsetelem(struct sk_buff *skb, .done = nf_tables_dump_set_done, .module = THIS_MODULE, }; - struct nft_set_dump_ctx dump_ctx = { - .set = set, - .ctx = ctx, - .reset = reset, + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + if (err) + return err; + + c.data = &dump_ctx; + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) + return -EINVAL; + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false); + if (err) + return err; + + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false); + if (err < 0) { + NL_SET_BAD_ATTR(extack, attr); + break; + } + } + + return err; +} + +static int nf_tables_getsetelem_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + struct nft_set_dump_ctx dump_ctx; + int rem, err = 0, nelems = 0; + struct nlattr *attr; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start = nf_tables_dump_set_start, + .dump = nf_tables_dumpreset_set, + .done = nf_tables_dump_set_done, + .module = THIS_MODULE, }; + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); + if (err) + return err; + c.data = &dump_ctx; return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } @@ -6133,18 +6277,31 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + rcu_read_lock(); + + err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true); + if (err) + goto out_unlock; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr, reset); + err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; } nelems++; } + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); - if (reset) - audit_log_nft_set_reset(table, nft_pernet(net)->base_seq, - nelems); +out_unlock: + rcu_read_unlock(); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); return err; } @@ -6503,7 +6660,7 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_setelem_is_catchall(set, elem_priv)) { - nft_set_elem_change_active(net, set, ext); + nft_clear(net, ext); } else { set->ops->activate(net, set, elem_priv); } @@ -7062,6 +7219,16 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) } } +static int nft_setelem_active_next(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + u8 genmask = nft_genmask_next(net); + + return nft_set_elem_active(ext, genmask); +} + static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -7185,8 +7352,12 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_trans *trans; + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) @@ -7248,6 +7419,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -7483,7 +7655,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; - list_for_each_entry(type, &nf_tables_objects, list) { + list_for_each_entry_rcu(type, &nf_tables_objects, list) { if (type->family != NFPROTO_UNSPEC && type->family != family) continue; @@ -7499,9 +7671,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; + rcu_read_lock(); type = __nft_obj_type_get(objtype, family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -8173,11 +8349,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, return err; } +/* call under rcu_read_lock */ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family) { const struct nf_flowtable_type *type; - list_for_each_entry(type, &nf_tables_flowtables, list) { + list_for_each_entry_rcu(type, &nf_tables_flowtables, list) { if (family == type->family) return type; } @@ -8189,9 +8366,13 @@ nft_flowtable_type_get(struct net *net, u8 family) { const struct nf_flowtable_type *type; + rcu_read_lock(); type = __nft_flowtable_type_get(family); - if (type != NULL && try_module_get(type->owner)) + if (type != NULL && try_module_get(type->owner)) { + rcu_read_unlock(); return type; + } + rcu_read_unlock(); lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES @@ -9122,7 +9303,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM_RESET] = { - .call = nf_tables_getsetelem, + .call = nf_tables_getsetelem_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, @@ -10092,9 +10273,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) { nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, &nft_trans_chain_hooks(trans)); - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } } else { nft_chain_del(trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, @@ -10333,10 +10516,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nft_trans *trans, *next; LIST_HEAD(set_update_list); struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { @@ -10366,9 +10550,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); @@ -10460,8 +10646,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); + if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { + nft_setelem_data_activate(net, te->set, te->elem_priv); + nft_setelem_activate(net, te->set, te->elem_priv); + } if (!nft_setelem_is_catchall(te->set, te->elem_priv)) te->set->ndeact--; @@ -10522,12 +10710,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - - return 0; + return err; } static int nf_tables_abort(struct net *net, struct sk_buff *skb, @@ -10540,6 +10723,17 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, gc_seq = nft_gc_seq_begin(nft_net); ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&nft_net->commit_mutex); return ret; @@ -10643,6 +10837,9 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); + if (!nft_set_elem_active(ext, iter->genmask)) + return 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) return 0; @@ -10727,6 +10924,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -11341,9 +11539,10 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nft_net->commit_list) || - !list_empty(&nft_net->module_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); __nft_release_tables(net); @@ -11435,6 +11634,7 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); + nf_tables_trans_destroy_flush_work(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 274b6f7e6b..d170758a1e 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -338,7 +338,9 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; if (n > 1) { - nf_unregister_net_hook(ctx->net, &found->ops); + if (!(ctx->chain->table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(ctx->net, &found->ops); + list_del_rcu(&found->list); kfree_rcu(found, rcu); return; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 397351fa4d..ab95760987 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -361,6 +361,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 870e5b113d..87c18eddb0 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -216,6 +216,7 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, return 0; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32df7a1683..1caa04619d 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -172,7 +172,7 @@ static void nft_bitmap_activate(const struct net *net, nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 11 state. */ priv->bitmap[idx] |= (genmask << off); - nft_set_elem_change_active(net, set, &be->ext); + nft_clear(net, &be->ext); } static void nft_bitmap_flush(const struct net *net, @@ -222,8 +222,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&be->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &be->priv); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6968a3b342..daa56dda73 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -199,7 +199,7 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, { struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_rhash_flush(const struct net *net, @@ -286,8 +286,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) @@ -599,7 +597,7 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, { struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &he->ext); + nft_clear(net, &he->ext); } static void nft_hash_flush(const struct net *net, @@ -652,8 +650,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[i], node) { if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index abf659cb2d..b42a34087e 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -360,7 +360,7 @@ * Return: -1 on no match, bit position on 'match_only', 0 otherwise. */ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, - union nft_pipapo_map_bucket *mt, bool match_only) + const union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; int k, ret = -1; @@ -412,9 +412,9 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo_scratch *scratch; unsigned long *res_map, *fill_map; u8 genmask = nft_genmask_cur(net); + const struct nft_pipapo_match *m; + const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; bool map_index; int i; @@ -519,11 +519,13 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; unsigned long *res_map, *fill_map = NULL; - struct nft_pipapo_field *f; + const struct nft_pipapo_match *m; + const struct nft_pipapo_field *f; int i; + m = priv->clone; + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); if (!res_map) { ret = ERR_PTR(-ENOMEM); @@ -1597,7 +1599,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; - struct nft_pipapo_field *f; + const struct nft_pipapo_field *f; int i, start, rules_fx; start = first_rule; @@ -1624,7 +1626,7 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) if (__nft_set_elem_expired(&e->ext, tstamp)) { priv->dirty = true; - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; @@ -1771,7 +1773,7 @@ static void nft_pipapo_activate(const struct net *net, { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &e->ext); + nft_clear(net, &e->ext); } /** @@ -2000,6 +2002,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; @@ -2012,16 +2016,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); - } - if (i == m->field_count) { - priv->dirty = true; - pipapo_drop(m, rulemap); - return; + if (last && f->mt[rulemap[i].to].e == e) { + priv->dirty = true; + pipapo_drop(m, rulemap); + return; + } } first_rule += rules_f0; } + + WARN_ON_ONCE(1); /* elem_priv not found */ } /** @@ -2038,13 +2044,15 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; + const struct nft_pipapo_match *m; + const struct nft_pipapo_field *f; int i, r; + WARN_ON_ONCE(iter->type != NFT_ITER_READ && + iter->type != NFT_ITER_UPDATE); + rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) + if (iter->type == NFT_ITER_READ) m = rcu_dereference(priv->match); else m = priv->clone; @@ -2066,9 +2074,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; - if (!nft_set_elem_active(&e->ext, iter->genmask)) - goto cont; - iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) goto out; diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index f59a0cd811..42464e7c24 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -144,10 +144,10 @@ struct nft_pipapo_scratch { /** * struct nft_pipapo_match - Data used for lookup and matching - * @field_count Amount of fields in set + * @field_count: Amount of fields in set * @scratch: Preallocated per-CPU maps for partial matching results * @bsize_max: Maximum lookup table bucket size of all fields, in longs - * @rcu Matching data is swapped on commits + * @rcu: Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { @@ -187,7 +187,7 @@ struct nft_pipapo_elem { }; int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, - union nft_pipapo_map_bucket *mt, bool match_only); + const union nft_pipapo_map_bucket *mt, bool match_only); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets @@ -195,7 +195,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, * @dst: Area to store result * @data: Input data selecting table buckets */ -static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, +static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { @@ -223,7 +223,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, * @dst: Area to store result * @data: Input data selecting table buckets */ -static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f, +static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index a3a8ddca99..d08407d589 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -212,8 +212,9 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map, * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; @@ -274,8 +275,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; @@ -350,8 +352,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -445,8 +448,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -534,8 +538,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, @@ -669,8 +674,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -726,8 +732,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -790,8 +797,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -865,8 +873,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -950,8 +959,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; @@ -1042,8 +1052,9 @@ nothing: * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, - struct nft_pipapo_field *f, int offset, - const u8 *pkt, bool first, bool last) + const struct nft_pipapo_field *f, + int offset, const u8 *pkt, + bool first, bool last) { unsigned long bsize = f->bsize; int i, ret = -1, b; @@ -1119,9 +1130,9 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; u8 genmask = nft_genmask_cur(net); + const struct nft_pipapo_match *m; + const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; - struct nft_pipapo_match *m; - struct nft_pipapo_field *f; unsigned long *res, *fill; bool map_index; int i, ret = 0; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9944fe479e..b7ea213275 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -532,7 +532,7 @@ static void nft_rbtree_activate(const struct net *net, { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); - nft_set_elem_change_active(net, set, &rbe->ext); + nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, @@ -600,8 +600,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, iter->genmask)) - goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6ae782efb1..ff31535126 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1204,8 +1204,7 @@ struct sock *netlink_getsockbyfilp(struct file *filp) return sock; } -static struct sk_buff *netlink_alloc_large_skb(unsigned int size, - int broadcast) +struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { struct sk_buff *skb; void *data; @@ -1520,8 +1519,7 @@ out: int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, - struct sk_buff *skb, void *data), + netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9c7ffd10df..8c7af02f84 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -631,6 +631,138 @@ static int genl_validate_ops(const struct genl_family *family) return 0; } +static void *genl_sk_priv_alloc(struct genl_family *family) +{ + void *priv; + + priv = kzalloc(family->sock_priv_size, GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + + if (family->sock_priv_init) + family->sock_priv_init(priv); + + return priv; +} + +static void genl_sk_priv_free(const struct genl_family *family, void *priv) +{ + if (family->sock_priv_destroy) + family->sock_priv_destroy(priv); + kfree(priv); +} + +static int genl_sk_privs_alloc(struct genl_family *family) +{ + if (!family->sock_priv_size) + return 0; + + family->sock_privs = kzalloc(sizeof(*family->sock_privs), GFP_KERNEL); + if (!family->sock_privs) + return -ENOMEM; + xa_init(family->sock_privs); + return 0; +} + +static void genl_sk_privs_free(const struct genl_family *family) +{ + unsigned long id; + void *priv; + + if (!family->sock_priv_size) + return; + + xa_for_each(family->sock_privs, id, priv) + genl_sk_priv_free(family, priv); + + xa_destroy(family->sock_privs); + kfree(family->sock_privs); +} + +static void genl_sk_priv_free_by_sock(struct genl_family *family, + struct sock *sk) +{ + void *priv; + + if (!family->sock_priv_size) + return; + priv = xa_erase(family->sock_privs, (unsigned long) sk); + if (!priv) + return; + genl_sk_priv_free(family, priv); +} + +static void genl_release(struct sock *sk, unsigned long *groups) +{ + struct genl_family *family; + unsigned int id; + + down_read(&cb_lock); + + idr_for_each_entry(&genl_fam_idr, family, id) + genl_sk_priv_free_by_sock(family, sk); + + up_read(&cb_lock); +} + +/** + * __genl_sk_priv_get - Get family private pointer for socket, if exists + * + * @family: family + * @sk: socket + * + * Lookup a private memory for a Generic netlink family and specified socket. + * + * Caller should make sure this is called in RCU read locked section. + * + * Return: valid pointer on success, otherwise negative error value + * encoded by ERR_PTR(), NULL in case priv does not exist. + */ +void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk) +{ + if (WARN_ON_ONCE(!family->sock_privs)) + return ERR_PTR(-EINVAL); + return xa_load(family->sock_privs, (unsigned long) sk); +} + +/** + * genl_sk_priv_get - Get family private pointer for socket + * + * @family: family + * @sk: socket + * + * Lookup a private memory for a Generic netlink family and specified socket. + * Allocate the private memory in case it was not already done. + * + * Return: valid pointer on success, otherwise negative error value + * encoded by ERR_PTR(). + */ +void *genl_sk_priv_get(struct genl_family *family, struct sock *sk) +{ + void *priv, *old_priv; + + priv = __genl_sk_priv_get(family, sk); + if (priv) + return priv; + + /* priv for the family does not exist so far, create it. */ + + priv = genl_sk_priv_alloc(family); + if (IS_ERR(priv)) + return ERR_CAST(priv); + + old_priv = xa_cmpxchg(family->sock_privs, (unsigned long) sk, NULL, + priv, GFP_KERNEL); + if (old_priv) { + genl_sk_priv_free(family, priv); + if (xa_is_err(old_priv)) + return ERR_PTR(xa_err(old_priv)); + /* Race happened, priv for the socket was already inserted. */ + return old_priv; + } + return priv; +} + /** * genl_register_family - register a generic netlink family * @family: generic netlink family @@ -659,6 +791,10 @@ int genl_register_family(struct genl_family *family) goto errout_locked; } + err = genl_sk_privs_alloc(family); + if (err) + goto errout_locked; + /* * Sadly, a few cases need to be special-cased * due to them having previously abused the API @@ -679,7 +815,7 @@ int genl_register_family(struct genl_family *family) start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; - goto errout_locked; + goto errout_sk_privs_free; } err = genl_validate_assign_mc_groups(family); @@ -698,6 +834,8 @@ int genl_register_family(struct genl_family *family) errout_remove: idr_remove(&genl_fam_idr, family->id); +errout_sk_privs_free: + genl_sk_privs_free(family); errout_locked: genl_unlock_all(); return err; @@ -728,6 +866,9 @@ int genl_unregister_family(const struct genl_family *family) up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); + + genl_sk_privs_free(family); + genl_unlock(); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); @@ -1688,10 +1829,10 @@ static int genl_bind(struct net *net, int group) continue; grp = &family->mcgrps[i]; - if ((grp->flags & GENL_UNS_ADMIN_PERM) && + if ((grp->flags & GENL_MCAST_CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) ret = -EPERM; - if (grp->cap_sys_admin && + if ((grp->flags & GENL_MCAST_CAP_SYS_ADMIN) && !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; @@ -1708,6 +1849,7 @@ static int __net_init genl_pernet_init(struct net *net) .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, + .release = genl_release, }; /* we'll bump the group number right afterwards */ diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index d63d2e5dc6..dae378f1d5 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -858,4 +858,5 @@ void nfc_digital_unregister_device(struct nfc_digital_dev *ddev) } EXPORT_SYMBOL(nfc_digital_unregister_device); +MODULE_DESCRIPTION("NFC Digital protocol stack"); MODULE_LICENSE("GPL"); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 12684d835c..0d26c8ec99 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1516,6 +1516,11 @@ static void nci_rx_work(struct work_struct *work) nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); + if (!nci_plen(skb->data)) { + kfree_skb(skb); + break; + } + /* Process frame */ switch (nci_mt(skb->data)) { case NCI_MT_RSP_PKT: @@ -1581,4 +1586,5 @@ static void nci_cmd_work(struct work_struct *work) } } +MODULE_DESCRIPTION("NFC Controller Interface"); MODULE_LICENSE("GPL"); diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index b68150c971..6a93533c48 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -319,4 +319,5 @@ done: } EXPORT_SYMBOL_GPL(nci_spi_read); +MODULE_DESCRIPTION("NFC Controller Interface (NCI) SPI link layer"); MODULE_LICENSE("GPL"); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3019a4406c..2928c142a2 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1380,8 +1380,9 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (ct_info.timeout[0]) { if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, ct_info.timeout)) - pr_info_ratelimited("Failed to associated timeout " - "policy `%s'\n", ct_info.timeout); + OVS_NLERR(log, + "Failed to associated timeout policy '%s'", + ct_info.timeout); else ct_info.nf_ct_timeout = rcu_dereference( nf_ct_timeout_find(ct_info.ct)->timeout); @@ -1592,9 +1593,9 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { struct hlist_head *head = &info->limits[i]; struct ovs_ct_limit *ct_limit; + struct hlist_node *next; - hlist_for_each_entry_rcu(ct_limit, head, hlist_node, - lockdep_ovsl_is_held()) + hlist_for_each_entry_safe(ct_limit, next, head, hlist_node) kfree_rcu(ct_limit, rcu); } kfree(info->limits); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f017d7d33d..e6a8701a38 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2121,13 +2121,13 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { + enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; - bool is_drop_n_account = false; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -2217,9 +2217,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, return 0; drop_n_acct: - is_drop_n_account = true; atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); + drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { @@ -2227,16 +2227,14 @@ drop_n_restore: skb->len = skb_len; } drop: - if (!is_drop_n_account) - consume_skb(skb); - else - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { + enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk; struct packet_sock *po; struct sockaddr_ll *sll; @@ -2250,7 +2248,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; - bool is_drop_n_account = false; unsigned int slot_id = 0; int vnet_hdr_sz = 0; @@ -2498,19 +2495,16 @@ drop_n_restore: skb->len = skb_len; } drop: - if (!is_drop_n_account) - consume_skb(skb); - else - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); - is_drop_n_account = true; + drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); - kfree_skb(copy_skb); + kfree_skb_reason(copy_skb, drop_reason); goto drop_n_restore; } @@ -4787,5 +4781,6 @@ out: module_init(packet_init); module_exit(packet_exit); +MODULE_DESCRIPTION("Packet socket support (AF_PACKET)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET); diff --git a/net/psample/psample.c b/net/psample/psample.c index c34e902855..ddd211a151 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -32,7 +32,7 @@ enum psample_nl_multicast_groups { static const struct genl_multicast_group psample_nl_mcgrps[] = { [PSAMPLE_NL_MCGRP_CONFIG] = { .name = PSAMPLE_NL_MCGRP_CONFIG_NAME }, [PSAMPLE_NL_MCGRP_SAMPLE] = { .name = PSAMPLE_NL_MCGRP_SAMPLE_NAME, - .flags = GENL_UNS_ADMIN_PERM }, + .flags = GENL_MCAST_CAP_NET_ADMIN, }, }; static struct genl_family psample_nl_family __ro_after_init; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a4e3c5de99..00dbcd4d28 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -302,7 +302,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ - if (ret == -ENODEV) + if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } diff --git a/net/rds/recv.c b/net/rds/recv.c index c71b923764..5627f80013 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -425,6 +425,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, struct sock *sk = rds_rs_to_sk(rs); int ret = 0; unsigned long flags; + struct rds_incoming *to_drop = NULL; write_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&inc->i_item)) { @@ -435,11 +436,14 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); - rds_inc_put(inc); + to_drop = inc; } } write_unlock_irqrestore(&rs->rs_recv_lock, flags); + if (to_drop) + rds_inc_put(to_drop); + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); return ret; } @@ -758,16 +762,21 @@ void rds_clear_recv_queue(struct rds_sock *rs) struct sock *sk = rds_rs_to_sk(rs); struct rds_incoming *inc, *tmp; unsigned long flags; + LIST_HEAD(to_drop); write_lock_irqsave(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + list_move(&inc->i_item, &to_drop); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + list_for_each_entry_safe(inc, tmp, &to_drop, i_item) { list_del_init(&inc->i_item); rds_inc_put(inc); } - write_unlock_irqrestore(&rs->rs_recv_lock, flags); } /* diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 53b3535a1e..05008ce5c4 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -165,7 +165,7 @@ int rds_tcp_accept_one(struct socket *sock) struct ipv6_pinfo *inet6; inet6 = inet6_sk(new_sock->sk); - dev_if = inet6->mcast_oif; + dev_if = READ_ONCE(inet6->mcast_oif); } else { dev_if = new_sock->sk->sk_bound_dev_if; } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 14cc8fe858..c3feb4f49d 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1351,11 +1351,11 @@ static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct rfkill_data *data = file->private_data; - int ret = -ENOSYS; + int ret = -ENOTTY; u32 size; if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) - return -ENOSYS; + return -ENOTTY; mutex_lock(&data->mtx); switch (_IOC_NR(cmd)) { diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 027414dafe..7818aae1be 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -68,6 +68,7 @@ struct rxrpc_net { atomic_t nr_calls; /* Count of allocated calls */ atomic_t nr_conns; + struct list_head bundle_proc_list; /* List of bundles for proc */ struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ @@ -440,6 +441,7 @@ struct rxrpc_bundle { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ + struct list_head proc_link; /* Link in net->bundle_proc_list */ const struct rxrpc_security *security; /* applied security module */ refcount_t ref; atomic_t active; /* Number of active users */ @@ -453,6 +455,7 @@ struct rxrpc_bundle { struct rb_node local_node; /* Node in local->client_conns */ struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ + unsigned int conn_ids[4]; /* Connection IDs. */ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */ }; @@ -1191,6 +1194,7 @@ void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); */ extern const struct seq_operations rxrpc_call_seq_ops; extern const struct seq_operations rxrpc_connection_seq_ops; +extern const struct seq_operations rxrpc_bundle_seq_ops; extern const struct seq_operations rxrpc_peer_seq_ops; extern const struct seq_operations rxrpc_local_seq_ops; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 1d95f8bc76..3b9b267a44 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -91,6 +91,10 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, atomic_set(&bundle->active, 1); INIT_LIST_HEAD(&bundle->waiting_calls); trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); + + write_lock(&bundle->local->rxnet->conn_lock); + list_add_tail(&bundle->proc_link, &bundle->local->rxnet->bundle_proc_list); + write_unlock(&bundle->local->rxnet->conn_lock); } return bundle; } @@ -109,6 +113,9 @@ static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), rxrpc_bundle_free); + write_lock(&bundle->local->rxnet->conn_lock); + list_del(&bundle->proc_link); + write_unlock(&bundle->local->rxnet->conn_lock); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); key_put(bundle->key); kfree(bundle); @@ -338,6 +345,7 @@ static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, old = bundle->conns[slot]; if (old) { bundle->conns[slot] = NULL; + bundle->conn_ids[slot] = 0; trace_rxrpc_client(old, -1, rxrpc_client_replace); rxrpc_put_connection(old, rxrpc_conn_put_noreuse); } @@ -351,6 +359,7 @@ static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, rxrpc_activate_bundle(bundle); conn->bundle_shift = shift; bundle->conns[slot] = conn; + bundle->conn_ids[slot] = conn->debug_id; for (i = 0; i < RXRPC_MAXCALLS; i++) set_bit(shift + i, &bundle->avail_chans); return true; @@ -671,6 +680,7 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (bundle->conns[bindex] == conn) { _debug("clear slot %u", bindex); bundle->conns[bindex] = NULL; + bundle->conn_ids[bindex] = 0; for (i = 0; i < RXRPC_MAXCALLS; i++) clear_bit(conn->bundle_shift + i, &bundle->avail_chans); rxrpc_put_client_connection_id(bundle->local, conn); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index a0319c040c..a4c135d0fb 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -45,6 +45,7 @@ static __net_init int rxrpc_init_net(struct net *net) atomic_set(&rxnet->nr_calls, 1); atomic_set(&rxnet->nr_conns, 1); + INIT_LIST_HEAD(&rxnet->bundle_proc_list); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); @@ -78,6 +79,9 @@ static __net_init int rxrpc_init_net(struct net *net) proc_create_net("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_ops, sizeof(struct seq_net_private)); + proc_create_net("bundles", 0444, rxnet->proc_net, + &rxrpc_bundle_seq_ops, + sizeof(struct seq_net_private)); proc_create_net("peers", 0444, rxnet->proc_net, &rxrpc_peer_seq_ops, sizeof(struct seq_net_private)); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 208312c244..26dc2f26d9 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -199,6 +199,82 @@ const struct seq_operations rxrpc_connection_seq_ops = { }; /* + * generate a list of extant virtual bundles in /proc/net/rxrpc/bundles + */ +static void *rxrpc_bundle_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rxnet->conn_lock) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + read_lock(&rxnet->conn_lock); + return seq_list_start_head(&rxnet->bundle_proc_list, *_pos); +} + +static void *rxrpc_bundle_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + return seq_list_next(v, &rxnet->bundle_proc_list, pos); +} + +static void rxrpc_bundle_seq_stop(struct seq_file *seq, void *v) + __releases(rxnet->conn_lock) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + read_unlock(&rxnet->conn_lock); +} + +static int rxrpc_bundle_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_bundle *bundle; + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + char lbuff[50], rbuff[50]; + + if (v == &rxnet->bundle_proc_list) { + seq_puts(seq, + "Proto Local " + " Remote " + " SvID Ref Act Flg Key |" + " Bundle Conn_0 Conn_1 Conn_2 Conn_3\n" + ); + return 0; + } + + bundle = list_entry(v, struct rxrpc_bundle, proc_link); + + sprintf(lbuff, "%pISpc", &bundle->local->srx.transport); + sprintf(rbuff, "%pISpc", &bundle->peer->srx.transport); + seq_printf(seq, + "UDP %-47.47s %-47.47s %4x %3u %3d" + " %c%c%c %08x | %08x %08x %08x %08x %08x\n", + lbuff, + rbuff, + bundle->service_id, + refcount_read(&bundle->ref), + atomic_read(&bundle->active), + bundle->try_upgrade ? 'U' : '-', + bundle->exclusive ? 'e' : '-', + bundle->upgrade ? 'u' : '-', + key_serial(bundle->key), + bundle->debug_id, + bundle->conn_ids[0], + bundle->conn_ids[1], + bundle->conn_ids[2], + bundle->conn_ids[3]); + + return 0; +} + +const struct seq_operations rxrpc_bundle_seq_ops = { + .start = rxrpc_bundle_seq_start, + .next = rxrpc_bundle_seq_next, + .stop = rxrpc_bundle_seq_stop, + .show = rxrpc_bundle_seq_show, +}; + +/* * generate a list of extant virtual peers in /proc/net/rxrpc/peers */ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) diff --git a/net/sched/Makefile b/net/sched/Makefile index b5fd49641d..82c3f78ca4 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -13,7 +13,6 @@ obj-$(CONFIG_NET_ACT_POLICE) += act_police.o obj-$(CONFIG_NET_ACT_GACT) += act_gact.o obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o obj-$(CONFIG_NET_ACT_SAMPLE) += act_sample.o -obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index c39252d61e..3e30d72604 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -816,6 +816,9 @@ EXPORT_SYMBOL(tcf_idr_cleanup); * its reference and bind counters, and return 1. Otherwise insert temporary * error pointer (to prevent concurrent users from inserting actions with same * index) and return 0. + * + * May return -EAGAIN for binding actions in case of a parallel add/delete on + * the requested index. */ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, @@ -824,43 +827,61 @@ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; int ret; + u32 max; -again: - mutex_lock(&idrinfo->lock); if (*index) { +again: + rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); + if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ - mutex_unlock(&idrinfo->lock); + rcu_read_unlock(); goto again; } - if (p) { - refcount_inc(&p->tcfa_refcnt); - if (bind) - atomic_inc(&p->tcfa_bindcnt); - *a = p; - ret = 1; - } else { - *a = NULL; - ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, - *index, GFP_KERNEL); - if (!ret) - idr_replace(&idrinfo->action_idr, - ERR_PTR(-EBUSY), *index); + if (!p) { + /* Empty slot, try to allocate it */ + max = *index; + rcu_read_unlock(); + goto new; + } + + if (!refcount_inc_not_zero(&p->tcfa_refcnt)) { + /* Action was deleted in parallel */ + rcu_read_unlock(); + return -EAGAIN; } + + if (bind) + atomic_inc(&p->tcfa_bindcnt); + *a = p; + + rcu_read_unlock(); + + return 1; } else { + /* Find a slot */ *index = 1; - *a = NULL; - ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, - UINT_MAX, GFP_KERNEL); - if (!ret) - idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), - *index); + max = UINT_MAX; } + +new: + *a = NULL; + + mutex_lock(&idrinfo->lock); + ret = idr_alloc_u32(&idrinfo->action_idr, ERR_PTR(-EBUSY), index, max, + GFP_KERNEL); mutex_unlock(&idrinfo->lock); + + /* N binds raced for action allocation, + * retry for all the ones that failed. + */ + if (ret == -ENOSPC && *index == max) + ret = -EAGAIN; + return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); @@ -1098,7 +1119,8 @@ repeat: } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { if (unlikely(!rcu_access_pointer(a->goto_chain))) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_CHAIN_NOTFOUND); return TC_ACT_SHOT; } tcf_action_goto_chain_exec(a, res); @@ -1118,8 +1140,7 @@ int tcf_action_destroy(struct tc_action *actions[], int bind) struct tc_action *a; int ret = 0, i; - for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { - a = actions[i]; + tcf_act_for_each_action(i, a, actions) { actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); @@ -1136,18 +1157,29 @@ static int tcf_action_put(struct tc_action *p) return __tcf_action_put(p, false); } -/* Put all actions in this array, skip those NULL's. */ static void tcf_action_put_many(struct tc_action *actions[]) { + struct tc_action *a; + int i; + + tcf_act_for_each_action(i, a, actions) { + const struct tc_action_ops *ops = a->ops; + if (tcf_action_put(a)) + module_put(ops->owner); + } +} + +static void tca_put_bound_many(struct tc_action *actions[], int init_res[]) +{ + struct tc_action *a; int i; - for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { - struct tc_action *a = actions[i]; - const struct tc_action_ops *ops; + tcf_act_for_each_action(i, a, actions) { + const struct tc_action_ops *ops = a->ops; - if (!a) + if (init_res[i] == ACT_P_CREATED) continue; - ops = a->ops; + if (tcf_action_put(a)) module_put(ops->owner); } @@ -1211,8 +1243,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int err = -EINVAL, i; struct nlattr *nest; - for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { - a = actions[i]; + tcf_act_for_each_action(i, a, actions) { nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; @@ -1274,30 +1305,29 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; -void tcf_idr_insert_many(struct tc_action *actions[]) +void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]) { + struct tc_action *a; int i; - for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { - struct tc_action *a = actions[i]; + tcf_act_for_each_action(i, a, actions) { struct tcf_idrinfo *idrinfo; - if (!a) + if (init_res[i] == ACT_P_BOUND) continue; + idrinfo = a->idrinfo; mutex_lock(&idrinfo->lock); - /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc if - * it is just created, otherwise this is just a nop. - */ + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ idr_replace(&idrinfo->action_idr, a, a->tcfa_index); mutex_unlock(&idrinfo->lock); } } -struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, - bool rtnl_held, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack) { + bool police = flags & TCA_ACT_FLAGS_POLICE; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; @@ -1329,6 +1359,8 @@ struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES + bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); + if (rtnl_held) rtnl_unlock(); request_module("act_%s", act_name); @@ -1445,9 +1477,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops(tb[i], flags & TCA_ACT_FLAGS_POLICE, - !(flags & TCA_ACT_FLAGS_NO_RTNL), - extack); + a_o = tc_action_load_ops(tb[i], flags, extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; @@ -1488,7 +1518,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, /* We have to commit them all together, because if any error happened in * between, we could not handle the failure gracefully. */ - tcf_idr_insert_many(actions); + tcf_idr_insert_many(actions, init_res); *attr_size = tcf_action_full_attrs_size(sz); err = i - 1; @@ -1497,10 +1527,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, err: tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: - for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { - if (ops[i]) - module_put(ops[i]->owner); - } + for (i = 0; i < TCA_ACT_MAX_PRIO && ops[i]; i++) + module_put(ops[i]->owner); return err; } @@ -1753,10 +1781,10 @@ err_out: static int tcf_action_delete(struct net *net, struct tc_action *actions[]) { + struct tc_action *a; int i; - for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { - struct tc_action *a = actions[i]; + tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their * type and id to search again after reference is released. @@ -1768,7 +1796,7 @@ static int tcf_action_delete(struct net *net, struct tc_action *actions[]) if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); - } else { + } else { int ret; /* now do the delete */ @@ -1780,31 +1808,45 @@ static int tcf_action_delete(struct net *net, struct tc_action *actions[]) return 0; } -static int -tcf_reoffload_del_notify(struct net *net, struct tc_action *action) +static struct sk_buff *tcf_reoffload_del_notify_msg(struct net *net, + struct tc_action *action) { size_t attr_size = tcf_action_fill_size(action); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; - const struct tc_action_ops *ops = action->ops; struct sk_buff *skb; - int ret; - skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, - GFP_KERNEL); + skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); - return -EINVAL; + return ERR_PTR(-EINVAL); + } + + return skb; +} + +static int tcf_reoffload_del_notify(struct net *net, struct tc_action *action) +{ + const struct tc_action_ops *ops = action->ops; + struct sk_buff *skb; + int ret; + + if (!rtnl_notify_needed(net, 0, RTNLGRP_TC)) { + skb = NULL; + } else { + skb = tcf_reoffload_del_notify_msg(net, action); + if (IS_ERR(skb)) + return PTR_ERR(skb); } ret = tcf_idr_release_unsafe(action); if (ret == ACT_P_DELETED) { module_put(ops->owner); - ret = rtnetlink_send(skb, net, 0, RTNLGRP_TC, 0); + ret = rtnetlink_maybe_send(skb, net, 0, RTNLGRP_TC, 0); } else { kfree_skb(skb); } @@ -1870,23 +1912,41 @@ int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, return 0; } -static int -tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +static struct sk_buff *tcf_del_notify_msg(struct net *net, struct nlmsghdr *n, + struct tc_action *actions[], + u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { - int ret; struct sk_buff *skb; - skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, - GFP_KERNEL); + skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); - return -EINVAL; + return ERR_PTR(-EINVAL); + } + + return skb; +} + +static int tcf_del_notify(struct net *net, struct nlmsghdr *n, + struct tc_action *actions[], u32 portid, + size_t attr_size, struct netlink_ext_ack *extack) +{ + struct sk_buff *skb; + int ret; + + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { + skb = NULL; + } else { + skb = tcf_del_notify_msg(net, n, actions, portid, attr_size, + extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); } /* now do the delete */ @@ -1897,9 +1957,8 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return ret; } - ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - return ret; + return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int @@ -1950,26 +2009,44 @@ err: return ret; } -static int -tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +static struct sk_buff *tcf_add_notify_msg(struct net *net, struct nlmsghdr *n, + struct tc_action *actions[], + u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { struct sk_buff *skb; - skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, - GFP_KERNEL); + skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); - return -EINVAL; + return ERR_PTR(-EINVAL); } - return rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); + return skb; +} + +static int tcf_add_notify(struct net *net, struct nlmsghdr *n, + struct tc_action *actions[], u32 portid, + size_t attr_size, struct netlink_ext_ack *extack) +{ + struct sk_buff *skb; + + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { + skb = NULL; + } else { + skb = tcf_add_notify_msg(net, n, actions, portid, attr_size, + extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); + } + + return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, @@ -1977,7 +2054,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { size_t attr_size = 0; - int loop, ret, i; + int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; int init_res[TCA_ACT_MAX_PRIO] = {}; @@ -1990,13 +2067,11 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, if (ret < 0) return ret; + ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); - /* only put existing actions */ - for (i = 0; i < TCA_ACT_MAX_PRIO; i++) - if (init_res[i] == ACT_P_CREATED) - actions[i] = NULL; - tcf_action_put_many(actions); + /* only put bound actions */ + tca_put_bound_many(actions, init_res); return ret; } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b0455fda7d..6cfee66581 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -318,7 +318,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, } else if (ret > 0) { /* Don't override defaults. */ if (bind) - return 0; + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*act, bind); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 0d7aee8933..f876275665 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -146,7 +146,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, } else if (ret > 0) { ci = to_connmark(*a); if (bind) { - err = 0; + err = ACT_P_BOUND; goto out_free; } if (!(flags & TCA_ACT_FLAGS_REPLACE)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 8ed285023a..7f8b1f2f2e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -77,8 +77,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; } else if (err > 0) { - if (bind)/* dont override defaults */ - return 0; + if (bind) /* dont override defaults */ + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 3d50215985..6124d8b128 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1349,7 +1349,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, res = ACT_P_CREATED; } else { if (bind) - return 0; + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 4d15b6a616..e620f9a84a 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -221,7 +221,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else if (err > 0) { if (bind) /* don't override defaults */ - return 0; + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 904ab3d457..4af3b7ec24 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -108,7 +108,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else if (err > 0) { if (bind)/* dont override defaults */ - return 0; + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 393b787292..c681cd011a 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -356,7 +356,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, return err; if (err && bind) - return 0; + return ACT_P_BOUND; if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index bc7611b074..0e867d13be 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -548,7 +548,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, exists = err; if (exists && bind) { kfree(p); - return 0; + return ACT_P_BOUND; } if (!exists) { diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c deleted file mode 100644 index 598d6e2991..0000000000 --- a/net/sched/act_ipt.c +++ /dev/null @@ -1,464 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/act_ipt.c iptables target interface - * - *TODO: Add other tables. For now we only support the ipv4 table targets - * - * Copyright: Jamal Hadi Salim (2002-13) - */ - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <linux/tc_act/tc_ipt.h> -#include <net/tc_act/tc_ipt.h> -#include <net/tc_wrapper.h> -#include <net/ip.h> - -#include <linux/netfilter_ipv4/ip_tables.h> - - -static struct tc_action_ops act_ipt_ops; -static struct tc_action_ops act_xt_ops; - -static int ipt_init_target(struct net *net, struct xt_entry_target *t, - char *table, unsigned int hook) -{ - struct xt_tgchk_param par; - struct xt_target *target; - struct ipt_entry e = {}; - int ret = 0; - - target = xt_request_find_target(AF_INET, t->u.user.name, - t->u.user.revision); - if (IS_ERR(target)) - return PTR_ERR(target); - - t->u.kernel.target = target; - memset(&par, 0, sizeof(par)); - par.net = net; - par.table = table; - par.entryinfo = &e; - par.target = target; - par.targinfo = t->data; - par.hook_mask = 1 << hook; - par.family = NFPROTO_IPV4; - - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); - if (ret < 0) { - module_put(t->u.kernel.target->me); - return ret; - } - return 0; -} - -static void ipt_destroy_target(struct xt_entry_target *t, struct net *net) -{ - struct xt_tgdtor_param par = { - .target = t->u.kernel.target, - .targinfo = t->data, - .family = NFPROTO_IPV4, - .net = net, - }; - if (par.target->destroy != NULL) - par.target->destroy(&par); - module_put(par.target->me); -} - -static void tcf_ipt_release(struct tc_action *a) -{ - struct tcf_ipt *ipt = to_ipt(a); - - if (ipt->tcfi_t) { - ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net); - kfree(ipt->tcfi_t); - } - kfree(ipt->tcfi_tname); -} - -static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { - [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, - [TCA_IPT_HOOK] = NLA_POLICY_RANGE(NLA_U32, NF_INET_PRE_ROUTING, - NF_INET_NUMHOOKS), - [TCA_IPT_INDEX] = { .type = NLA_U32 }, - [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, -}; - -static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, - const struct tc_action_ops *ops, - struct tcf_proto *tp, u32 flags) -{ - struct tc_action_net *tn = net_generic(net, id); - bool bind = flags & TCA_ACT_FLAGS_BIND; - struct nlattr *tb[TCA_IPT_MAX + 1]; - struct tcf_ipt *ipt; - struct xt_entry_target *td, *t; - char *tname; - bool exists = false; - int ret = 0, err; - u32 hook = 0; - u32 index = 0; - - if (nla == NULL) - return -EINVAL; - - err = nla_parse_nested_deprecated(tb, TCA_IPT_MAX, nla, ipt_policy, - NULL); - if (err < 0) - return err; - - if (tb[TCA_IPT_INDEX] != NULL) - index = nla_get_u32(tb[TCA_IPT_INDEX]); - - err = tcf_idr_check_alloc(tn, &index, a, bind); - if (err < 0) - return err; - exists = err; - if (exists && bind) - return 0; - - if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { - if (exists) - tcf_idr_release(*a, bind); - else - tcf_idr_cleanup(tn, index); - return -EINVAL; - } - - td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); - if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { - if (exists) - tcf_idr_release(*a, bind); - else - tcf_idr_cleanup(tn, index); - return -EINVAL; - } - - if (!exists) { - ret = tcf_idr_create(tn, index, est, a, ops, bind, - false, flags); - if (ret) { - tcf_idr_cleanup(tn, index); - return ret; - } - ret = ACT_P_CREATED; - } else { - if (bind)/* dont override defaults */ - return 0; - - if (!(flags & TCA_ACT_FLAGS_REPLACE)) { - tcf_idr_release(*a, bind); - return -EEXIST; - } - } - - err = -EINVAL; - hook = nla_get_u32(tb[TCA_IPT_HOOK]); - switch (hook) { - case NF_INET_PRE_ROUTING: - break; - case NF_INET_POST_ROUTING: - break; - default: - goto err1; - } - - if (tb[TCA_IPT_TABLE]) { - /* mangle only for now */ - if (nla_strcmp(tb[TCA_IPT_TABLE], "mangle")) - goto err1; - } - - tname = kstrdup("mangle", GFP_KERNEL); - if (unlikely(!tname)) - goto err1; - - t = kmemdup(td, td->u.target_size, GFP_KERNEL); - if (unlikely(!t)) - goto err2; - - err = ipt_init_target(net, t, tname, hook); - if (err < 0) - goto err3; - - ipt = to_ipt(*a); - - spin_lock_bh(&ipt->tcf_lock); - if (ret != ACT_P_CREATED) { - ipt_destroy_target(ipt->tcfi_t, net); - kfree(ipt->tcfi_tname); - kfree(ipt->tcfi_t); - } - ipt->tcfi_tname = tname; - ipt->tcfi_t = t; - ipt->tcfi_hook = hook; - spin_unlock_bh(&ipt->tcf_lock); - return ret; - -err3: - kfree(t); -err2: - kfree(tname); -err1: - tcf_idr_release(*a, bind); - return err; -} - -static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, - struct tcf_proto *tp, - u32 flags, struct netlink_ext_ack *extack) -{ - return __tcf_ipt_init(net, act_ipt_ops.net_id, nla, est, - a, &act_ipt_ops, tp, flags); -} - -static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, - struct tcf_proto *tp, - u32 flags, struct netlink_ext_ack *extack) -{ - return __tcf_ipt_init(net, act_xt_ops.net_id, nla, est, - a, &act_xt_ops, tp, flags); -} - -static bool tcf_ipt_act_check(struct sk_buff *skb) -{ - const struct iphdr *iph; - unsigned int nhoff, len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return false; - - nhoff = skb_network_offset(skb); - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return false; - - len = skb_ip_totlen(skb); - if (skb->len < nhoff + len || len < (iph->ihl * 4u)) - return false; - - return pskb_may_pull(skb, iph->ihl * 4u); -} - -TC_INDIRECT_SCOPE int tcf_ipt_act(struct sk_buff *skb, - const struct tc_action *a, - struct tcf_result *res) -{ - char saved_cb[sizeof_field(struct sk_buff, cb)]; - int ret = 0, result = 0; - struct tcf_ipt *ipt = to_ipt(a); - struct xt_action_param par; - struct nf_hook_state state = { - .net = dev_net(skb->dev), - .in = skb->dev, - .hook = ipt->tcfi_hook, - .pf = NFPROTO_IPV4, - }; - - if (skb_protocol(skb, false) != htons(ETH_P_IP)) - return TC_ACT_UNSPEC; - - if (skb_unclone(skb, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - - if (!tcf_ipt_act_check(skb)) - return TC_ACT_UNSPEC; - - if (state.hook == NF_INET_POST_ROUTING) { - if (!skb_dst(skb)) - return TC_ACT_UNSPEC; - - state.out = skb->dev; - } - - memcpy(saved_cb, skb->cb, sizeof(saved_cb)); - - spin_lock(&ipt->tcf_lock); - - tcf_lastuse_update(&ipt->tcf_tm); - bstats_update(&ipt->tcf_bstats, skb); - - /* yes, we have to worry about both in and out dev - * worry later - danger - this API seems to have changed - * from earlier kernels - */ - par.state = &state; - par.target = ipt->tcfi_t->u.kernel.target; - par.targinfo = ipt->tcfi_t->data; - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - - ret = par.target->target(skb, &par); - - switch (ret) { - case NF_ACCEPT: - result = TC_ACT_OK; - break; - case NF_DROP: - result = TC_ACT_SHOT; - ipt->tcf_qstats.drops++; - break; - case XT_CONTINUE: - result = TC_ACT_PIPE; - break; - default: - net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", - ret); - result = TC_ACT_OK; - break; - } - spin_unlock(&ipt->tcf_lock); - - memcpy(skb->cb, saved_cb, sizeof(skb->cb)); - - return result; - -} - -static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, - int ref) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tcf_ipt *ipt = to_ipt(a); - struct xt_entry_target *t; - struct tcf_t tm; - struct tc_cnt c; - - /* for simple targets kernel size == user size - * user name = target name - * for foolproof you need to not assume this - */ - - spin_lock_bh(&ipt->tcf_lock); - t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); - if (unlikely(!t)) - goto nla_put_failure; - - c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; - c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; - strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); - - if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || - nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || - nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || - nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || - nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) - goto nla_put_failure; - - tcf_tm_dump(&tm, &ipt->tcf_tm); - if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) - goto nla_put_failure; - - spin_unlock_bh(&ipt->tcf_lock); - kfree(t); - return skb->len; - -nla_put_failure: - spin_unlock_bh(&ipt->tcf_lock); - nlmsg_trim(skb, b); - kfree(t); - return -1; -} - -static struct tc_action_ops act_ipt_ops = { - .kind = "ipt", - .id = TCA_ID_IPT, - .owner = THIS_MODULE, - .act = tcf_ipt_act, - .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_release, - .init = tcf_ipt_init, - .size = sizeof(struct tcf_ipt), -}; - -static __net_init int ipt_init_net(struct net *net) -{ - struct tc_action_net *tn = net_generic(net, act_ipt_ops.net_id); - - return tc_action_net_init(net, tn, &act_ipt_ops); -} - -static void __net_exit ipt_exit_net(struct list_head *net_list) -{ - tc_action_net_exit(net_list, act_ipt_ops.net_id); -} - -static struct pernet_operations ipt_net_ops = { - .init = ipt_init_net, - .exit_batch = ipt_exit_net, - .id = &act_ipt_ops.net_id, - .size = sizeof(struct tc_action_net), -}; - -static struct tc_action_ops act_xt_ops = { - .kind = "xt", - .id = TCA_ID_XT, - .owner = THIS_MODULE, - .act = tcf_ipt_act, - .dump = tcf_ipt_dump, - .cleanup = tcf_ipt_release, - .init = tcf_xt_init, - .size = sizeof(struct tcf_ipt), -}; - -static __net_init int xt_init_net(struct net *net) -{ - struct tc_action_net *tn = net_generic(net, act_xt_ops.net_id); - - return tc_action_net_init(net, tn, &act_xt_ops); -} - -static void __net_exit xt_exit_net(struct list_head *net_list) -{ - tc_action_net_exit(net_list, act_xt_ops.net_id); -} - -static struct pernet_operations xt_net_ops = { - .init = xt_init_net, - .exit_batch = xt_exit_net, - .id = &act_xt_ops.net_id, - .size = sizeof(struct tc_action_net), -}; - -MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); -MODULE_DESCRIPTION("Iptables target actions"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("act_xt"); - -static int __init ipt_init_module(void) -{ - int ret1, ret2; - - ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops); - if (ret1 < 0) - pr_err("Failed to load xt action\n"); - - ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops); - if (ret2 < 0) - pr_err("Failed to load ipt action\n"); - - if (ret1 < 0 && ret2 < 0) { - return ret1; - } else - return 0; -} - -static void __exit ipt_cleanup_module(void) -{ - tcf_unregister_action(&act_ipt_ops, &ipt_net_ops); - tcf_unregister_action(&act_xt_ops, &xt_net_ops); -} - -module_init(ipt_init_module); -module_exit(ipt_cleanup_module); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 674f7ae356..6faa7d00da 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -85,10 +85,21 @@ static void tcf_mirred_release(struct tc_action *a) static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, + [TCA_MIRRED_BLOCKID] = NLA_POLICY_MIN(NLA_U32, 1), }; static struct tc_action_ops act_mirred_ops; +static void tcf_mirred_replace_dev(struct tcf_mirred *m, + struct net_device *ndev) +{ + struct net_device *odev; + + odev = rcu_replace_pointer(m->tcfm_dev, ndev, + lockdep_is_held(&m->tcf_lock)); + netdev_put(odev, &m->tcfm_dev_tracker); +} + static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, @@ -124,7 +135,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; + + if (tb[TCA_MIRRED_BLOCKID] && parm->ifindex) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot specify Block ID and dev simultaneously"); + if (exists) + tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); + + return -EINVAL; + } switch (parm->eaction) { case TCA_EGRESS_MIRROR: @@ -142,9 +164,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (!exists) { - if (!parm->ifindex) { + if (!parm->ifindex && !tb[TCA_MIRRED_BLOCKID]) { tcf_idr_cleanup(tn, index); - NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); + NL_SET_ERR_MSG_MOD(extack, + "Must specify device or block"); return -EINVAL; } ret = tcf_idr_create_from_flags(tn, index, est, a, @@ -170,7 +193,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&m->tcf_lock); if (parm->ifindex) { - struct net_device *odev, *ndev; + struct net_device *ndev; ndev = dev_get_by_index(net, parm->ifindex); if (!ndev) { @@ -179,11 +202,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(ndev); - odev = rcu_replace_pointer(m->tcfm_dev, ndev, - lockdep_is_held(&m->tcf_lock)); - netdev_put(odev, &m->tcfm_dev_tracker); + tcf_mirred_replace_dev(m, ndev); netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC); m->tcfm_mac_header_xmit = mac_header_xmit; + m->tcfm_blockid = 0; + } else if (tb[TCA_MIRRED_BLOCKID]) { + tcf_mirred_replace_dev(m, NULL); + m->tcfm_mac_header_xmit = false; + m->tcfm_blockid = nla_get_u32(tb[TCA_MIRRED_BLOCKID]); } goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); m->tcfm_eaction = parm->eaction; @@ -302,6 +328,89 @@ err_cant_do: return retval; } +static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, + struct tcf_block *block, int m_eaction, + const u32 exception_ifindex, int retval) +{ + struct net_device *dev_prev = NULL; + struct net_device *dev = NULL; + unsigned long index; + int mirred_eaction; + + mirred_eaction = tcf_mirred_act_wants_ingress(m_eaction) ? + TCA_INGRESS_MIRROR : TCA_EGRESS_MIRROR; + + xa_for_each(&block->ports, index, dev) { + if (index == exception_ifindex) + continue; + + if (!dev_prev) + goto assign_prev; + + tcf_mirred_to_dev(skb, m, dev_prev, + dev_is_mac_header_xmit(dev), + mirred_eaction, retval); +assign_prev: + dev_prev = dev; + } + + if (dev_prev) + return tcf_mirred_to_dev(skb, m, dev_prev, + dev_is_mac_header_xmit(dev_prev), + m_eaction, retval); + + return retval; +} + +static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, + struct tcf_block *block, int m_eaction, + const u32 exception_ifindex, int retval) +{ + struct net_device *dev = NULL; + unsigned long index; + + xa_for_each(&block->ports, index, dev) { + if (index == exception_ifindex) + continue; + + tcf_mirred_to_dev(skb, m, dev, + dev_is_mac_header_xmit(dev), + m_eaction, retval); + } + + return retval; +} + +static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, + const u32 blockid, struct tcf_result *res, + int retval) +{ + const u32 exception_ifindex = skb->dev->ifindex; + struct tcf_block *block; + bool is_redirect; + int m_eaction; + + m_eaction = READ_ONCE(m->tcfm_eaction); + is_redirect = tcf_mirred_is_act_redirect(m_eaction); + + /* we are already under rcu protection, so can call block lookup + * directly. + */ + block = tcf_block_lookup(dev_net(skb->dev), blockid); + if (!block || xa_empty(&block->ports)) { + tcf_action_inc_overlimit_qstats(&m->common); + return retval; + } + + if (is_redirect) + return tcf_blockcast_redir(skb, m, block, m_eaction, + exception_ifindex, retval); + + /* If it's not redirect, it is mirror */ + return tcf_blockcast_mirror(skb, m, block, m_eaction, exception_ifindex, + retval); +} + TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) @@ -312,6 +421,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, bool m_mac_header_xmit; struct net_device *dev; int m_eaction; + u32 blockid; nest_level = __this_cpu_inc_return(mirred_nest_level); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { @@ -324,6 +434,12 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); + blockid = READ_ONCE(m->tcfm_blockid); + if (blockid) { + retval = tcf_blockcast(skb, m, blockid, res, retval); + goto dec_nest_level; + } + dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); @@ -365,6 +481,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, }; struct net_device *dev; struct tcf_t t; + u32 blockid; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; @@ -376,6 +493,10 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; + blockid = m->tcfm_blockid; + if (blockid && nla_put_u32(skb, TCA_MIRRED_BLOCKID, blockid)) + goto nla_put_failure; + tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 1010dc6328..34b8edb6cc 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -195,7 +195,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_mpls_ops, bind, diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4184af5abb..a180e72463 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -69,7 +69,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, ret = ACT_P_CREATED; } else if (err > 0) { if (bind) - return 0; + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 1ef8fcfa99..2ef22969f2 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -202,7 +202,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else if (err > 0) { if (bind) - return 0; + return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index f3121c5a85..e119b4a3db 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -77,7 +77,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 4c670e7568..c5c61efe6d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -66,7 +66,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, est, a, diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 4b84514534..0a3e928882 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -118,7 +118,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; if (tb[TCA_DEF_DATA] == NULL) { if (exists) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index ce7008cf29..754f78b35b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -209,7 +209,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; if (!flags) { if (exists) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index dffa990a96..0015393910 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -157,7 +157,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; if (!lflags) { if (exists) @@ -241,13 +241,13 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); struct tcf_skbmod_params *p; - struct tc_skbmod opt = { - .index = d->tcf_index, - .refcnt = refcount_read(&d->tcf_refcnt) - ref, - .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - }; + struct tc_skbmod opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + opt.index = d->tcf_index; + opt.refcnt = refcount_read(&d->tcf_refcnt) - ref, + opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; p = rcu_dereference_protected(d->skbmod_p, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 0c8aa7e686..300b08aa82 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -401,7 +401,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; switch (parm->t_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 0251442f5f..836183011a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -151,7 +151,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, return err; exists = err; if (exists && bind) - return 0; + return ACT_P_BOUND; switch (parm->v_action) { case TCA_VLAN_ACT_POP: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 02c594baa1..ff3d396a65 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -531,6 +531,7 @@ static void tcf_block_destroy(struct tcf_block *block) { mutex_destroy(&block->lock); mutex_destroy(&block->proto_destroy_lock); + xa_destroy(&block->ports); kfree_rcu(block, rcu); } @@ -650,7 +651,7 @@ static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct tcf_block *block, struct sk_buff *oskb, - u32 seq, u16 flags, bool unicast); + u32 seq, u16 flags); static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, bool explicitly_created) @@ -685,8 +686,7 @@ static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, if (non_act_refcnt == chain->explicitly_created && !by_act) { if (non_act_refcnt == 0) tc_chain_notify_delete(tmplt_ops, tmplt_priv, - chain->index, block, NULL, 0, 0, - false); + chain->index, block, NULL, 0, 0); /* Last reference to chain, no need to lock. */ chain->flushing = false; } @@ -1003,6 +1003,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, refcount_set(&block->refcnt, 1); block->net = net; block->index = block_index; + xa_init(&block->ports); /* Don't store q pointer for blocks which are shared */ if (!tcf_block_shared(block)) @@ -1010,12 +1011,13 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, return block; } -static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) +struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) { struct tcf_net *tn = net_generic(net, tcf_net_id); return idr_find(&tn->idr, block_index); } +EXPORT_SYMBOL(tcf_block_lookup); static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index) { @@ -1422,10 +1424,19 @@ static void tcf_block_owner_del(struct tcf_block *block, WARN_ON(1); } +static bool tcf_block_tracks_dev(struct tcf_block *block, + struct tcf_block_ext_info *ei) +{ + return tcf_block_shared(block) && + (ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS || + ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS); +} + int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { + struct net_device *dev = qdisc_dev(q); struct net *net = qdisc_net(q); struct tcf_block *block = NULL; int err; @@ -1459,9 +1470,18 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_block_offload_bind; + if (tcf_block_tracks_dev(block, ei)) { + err = xa_insert(&block->ports, dev->ifindex, dev, GFP_KERNEL); + if (err) { + NL_SET_ERR_MSG(extack, "block dev insert failed"); + goto err_dev_insert; + } + } + *p_block = block; return 0; +err_dev_insert: err_block_offload_bind: tcf_chain0_head_change_cb_del(block, ei); err_chain0_head_change_cb_add: @@ -1500,8 +1520,12 @@ EXPORT_SYMBOL(tcf_block_get); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { + struct net_device *dev = qdisc_dev(q); + if (!block) return; + if (tcf_block_tracks_dev(block, ei)) + xa_erase(&block->ports, dev->ifindex); tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); @@ -1664,7 +1688,6 @@ static inline int __tcf_classify(struct sk_buff *skb, int act_index, u32 *last_executed_chain) { - u32 orig_reason = res->drop_reason; #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; @@ -1689,13 +1712,15 @@ reclassify: */ if (unlikely(n->tp != tp || n->tp->chain != n->chain || !tp->ops->get_exts)) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } exts = tp->ops->get_exts(tp, n->handle); if (unlikely(!exts || n->exts != exts)) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } @@ -1719,18 +1744,13 @@ reclassify: goto reset; } #endif - if (err >= 0) { - /* Policy drop or drop reason is over-written by - * classifiers with a bogus value(0) */ - if (err == TC_ACT_SHOT && - res->drop_reason == SKB_NOT_DROPPED_YET) - tcf_set_drop_reason(res, orig_reason); + if (err >= 0) return err; - } } if (unlikely(n)) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } @@ -1742,7 +1762,8 @@ reset: tp->chain->block->index, tp->prio & 0xffff, ntohs(tp->protocol)); - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return TC_ACT_SHOT; } @@ -1780,7 +1801,8 @@ int tcf_classify(struct sk_buff *skb, n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie, &act_index); if (!n) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } @@ -1791,7 +1813,9 @@ int tcf_classify(struct sk_buff *skb, fchain = tcf_chain_lookup_rcu(block, chain); if (!fchain) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, + SKB_DROP_REASON_TC_CHAIN_NOTFOUND); + return TC_ACT_SHOT; } @@ -1813,10 +1837,9 @@ int tcf_classify(struct sk_buff *skb, ext = tc_skb_ext_alloc(skb); if (WARN_ON_ONCE(!ext)) { - tcf_set_drop_reason(res, SKB_DROP_REASON_TC_ERROR); + tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM); return TC_ACT_SHOT; } - ext->chain = last_executed_chain; ext->mru = cb->mru; ext->post_ct = cb->post_ct; @@ -2059,6 +2082,9 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err = 0; + if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -2081,13 +2107,16 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, - u32 parent, void *fh, bool unicast, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) + u32 parent, void *fh, bool *last, bool rtnl_held, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err; + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return tp->ops->delete(tp, fh, last, rtnl_held, extack); + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -2106,11 +2135,8 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, return err; } - if (unicast) - err = rtnl_unicast(skb, net, portid); - else - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); + err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); @@ -2505,9 +2531,8 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, } else { bool last; - err = tfilter_del_notify(net, skb, n, tp, block, - q, parent, fh, false, &last, - rtnl_held, extack); + err = tfilter_del_notify(net, skb, n, tp, block, q, parent, fh, + &last, rtnl_held, extack); if (err) goto errout; @@ -2738,6 +2763,7 @@ errout: } static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { + [TCA_CHAIN] = { .type = NLA_U32 }, [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), }; @@ -2912,6 +2938,9 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, struct sk_buff *skb; int err = 0; + if (!unicast && !rtnl_notify_needed(net, flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -2935,12 +2964,15 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct tcf_block *block, struct sk_buff *oskb, - u32 seq, u16 flags, bool unicast) + u32 seq, u16 flags) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct net *net = block->net; struct sk_buff *skb; + if (!rtnl_notify_needed(net, flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -2951,9 +2983,6 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, return -EINVAL; } - if (unicast) - return rtnl_unicast(skb, net, portid); - return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } @@ -3300,12 +3329,11 @@ int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr ** if (exts->police && tb[exts->police]) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops(tb[exts->police], true, - !(flags & TCA_ACT_FLAGS_NO_RTNL), + flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; + a_o = tc_action_load_ops(tb[exts->police], flags, extack); if (IS_ERR(a_o)) return PTR_ERR(a_o); - flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, a_o, init_res, flags, extack); @@ -3316,7 +3344,7 @@ int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr ** act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; - tcf_idr_insert_many(exts->actions); + tcf_idr_insert_many(exts->actions, init_res); } else if (exts->action && tb[exts->action]) { int err; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d5bdfd4a76..289e1755c2 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -71,7 +71,7 @@ struct tc_u_hnode { struct tc_u_hnode __rcu *next; u32 handle; u32 prio; - int refcnt; + refcount_t refcnt; unsigned int divisor; struct idr handle_idr; bool is_root; @@ -86,7 +86,7 @@ struct tc_u_hnode { struct tc_u_common { struct tc_u_hnode __rcu *hlist; void *ptr; - int refcnt; + refcount_t refcnt; struct idr handle_idr; struct hlist_node hnode; long knodes; @@ -359,7 +359,7 @@ static int u32_init(struct tcf_proto *tp) if (root_ht == NULL) return -ENOBUFS; - root_ht->refcnt++; + refcount_set(&root_ht->refcnt, 1); root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; root_ht->is_root = true; @@ -371,18 +371,20 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } + refcount_set(&tp_c->refcnt, 1); tp_c->ptr = key; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); hlist_add_head(&tp_c->hnode, tc_u_hash(key)); + } else { + refcount_inc(&tp_c->refcnt); } - tp_c->refcnt++; RCU_INIT_POINTER(root_ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, root_ht); - root_ht->refcnt++; + /* root_ht must be destroyed when tcf_proto is destroyed */ rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; @@ -393,7 +395,7 @@ static void __u32_destroy_key(struct tc_u_knode *n) struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); tcf_exts_destroy(&n->exts); - if (ht && --ht->refcnt == 0) + if (ht && refcount_dec_and_test(&ht->refcnt)) kfree(ht); kfree(n); } @@ -601,8 +603,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct tc_u_hnode __rcu **hn; struct tc_u_hnode *phn; - WARN_ON(--ht->refcnt); - u32_clear_hnode(tp, ht, extack); hn = &tp_c->hlist; @@ -630,10 +630,10 @@ static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, WARN_ON(root_ht == NULL); - if (root_ht && --root_ht->refcnt == 1) + if (root_ht && refcount_dec_and_test(&root_ht->refcnt)) u32_destroy_hnode(tp, root_ht, extack); - if (--tp_c->refcnt == 0) { + if (refcount_dec_and_test(&tp_c->refcnt)) { struct tc_u_hnode *ht; hlist_del(&tp_c->hnode); @@ -645,7 +645,7 @@ static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, /* u32_destroy_key() will later free ht for us, if it's * still referenced by some knode */ - if (--ht->refcnt == 0) + if (refcount_dec_and_test(&ht->refcnt)) kfree_rcu(ht, rcu); } @@ -674,7 +674,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, return -EINVAL; } - if (ht->refcnt == 1) { + if (refcount_dec_if_one(&ht->refcnt)) { u32_destroy_hnode(tp, ht, extack); } else { NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); @@ -682,7 +682,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, } out: - *last = tp_c->refcnt == 1 && tp_c->knodes == 0; + *last = refcount_read(&tp_c->refcnt) == 1 && tp_c->knodes == 0; return ret; } @@ -766,14 +766,14 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG_MOD(extack, "Not linking to root node"); return -EINVAL; } - ht_down->refcnt++; + refcount_inc(&ht_down->refcnt); } ht_old = rtnl_dereference(n->ht_down); rcu_assign_pointer(n->ht_down, ht_down); if (ht_old) - ht_old->refcnt--; + refcount_dec(&ht_old->refcnt); } if (ifindex >= 0) @@ -852,7 +852,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, /* bump reference count as long as we hold pointer to structure */ if (ht) - ht->refcnt++; + refcount_inc(&ht->refcnt); return new; } @@ -932,7 +932,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht_old = rtnl_dereference(n->ht_down); if (ht_old) - ht_old->refcnt++; + refcount_inc(&ht_old->refcnt); } __u32_destroy_key(new); return err; @@ -980,7 +980,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } } - ht->refcnt = 1; + refcount_set(&ht->refcnt, 1); ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index 5ea84decec..5337bc4627 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -222,6 +222,7 @@ static void __exit exit_em_canid(void) tcf_em_unregister(&em_canid_ops); } +MODULE_DESCRIPTION("ematch classifier to match CAN IDs embedded in skb CAN frames"); MODULE_LICENSE("GPL"); module_init(init_em_canid); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index f17b049ea5..c90ad7ea26 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -87,6 +87,7 @@ static void __exit exit_em_cmp(void) tcf_em_unregister(&em_cmp_ops); } +MODULE_DESCRIPTION("ematch classifier for basic data types(8/16/32 bit) against skb data"); MODULE_LICENSE("GPL"); module_init(init_em_cmp); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 09d8afd04a..8996c73c97 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -1006,6 +1006,7 @@ static void __exit exit_em_meta(void) tcf_em_unregister(&em_meta_ops); } +MODULE_DESCRIPTION("ematch classifier for various internal kernel metadata, skb metadata and sk metadata"); MODULE_LICENSE("GPL"); module_init(init_em_meta); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index a83b237cbe..4f9f21a05d 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -68,6 +68,7 @@ static void __exit exit_em_nbyte(void) tcf_em_unregister(&em_nbyte_ops); } +MODULE_DESCRIPTION("ematch classifier for arbitrary skb multi-bytes"); MODULE_LICENSE("GPL"); module_init(init_em_nbyte); diff --git a/net/sched/em_text.c b/net/sched/em_text.c index f176afb705..420c66203b 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -147,6 +147,7 @@ static void __exit exit_em_text(void) tcf_em_unregister(&em_text_ops); } +MODULE_DESCRIPTION("ematch classifier for embedded text in skbs"); MODULE_LICENSE("GPL"); module_init(init_em_text); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 71b070da04..fdec4db5ec 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -52,6 +52,7 @@ static void __exit exit_em_u32(void) tcf_em_unregister(&em_u32_ops); } +MODULE_DESCRIPTION("ematch skb classifier using 32 bit chunks of data"); MODULE_LICENSE("GPL"); module_init(init_em_u32); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e9eaf63722..31e38a614f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -809,7 +809,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) notify = !sch->q.qlen && !WARN_ON_ONCE(!n && !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ - sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; @@ -1003,6 +1003,32 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) return false; } +static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, struct Qdisc *q, + struct netlink_ext_ack *extack) +{ + struct sk_buff *skb; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (!tc_qdisc_dump_ignore(q, false)) { + if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, + RTM_NEWQDISC, extack) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, @@ -1011,6 +1037,9 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -1542,7 +1571,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (err != 0) return err; } else { - qdisc_notify(net, skb, n, clid, NULL, q, NULL); + qdisc_get_notify(net, skb, n, clid, q, NULL); } return 0; } @@ -1936,6 +1965,9 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -1949,6 +1981,27 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, n->nlmsg_flags & NLM_F_ECHO); } +static int tclass_get_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, struct netlink_ext_ack *extack) +{ + struct sk_buff *skb; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS, + extack) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); +} + static int tclass_del_notify(struct net *net, const struct Qdisc_class_ops *cops, struct sk_buff *oskb, struct nlmsghdr *n, @@ -1962,14 +2015,18 @@ static int tclass_del_notify(struct net *net, if (!cops->delete) return -EOPNOTSUPP; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; + if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, - RTM_DELTCLASS, extack) < 0) { - kfree_skb(skb); - return -EINVAL; + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, + RTM_DELTCLASS, extack) < 0) { + kfree_skb(skb); + return -EINVAL; + } + } else { + skb = NULL; } err = cops->delete(q, cl, extack); @@ -1978,8 +2035,8 @@ static int tclass_del_notify(struct net *net, return err; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); + err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); return err; } @@ -2174,7 +2231,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: - err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack); + err = tclass_get_notify(net, skb, n, q, cl, extack); goto out; default: err = -EINVAL; diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 9a0b85190a..beece8e82c 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -57,6 +57,8 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <linux/units.h> + #include <net/netevent.h> #include <net/netlink.h> #include <net/sch_generic.h> @@ -65,8 +67,6 @@ static LIST_HEAD(cbs_list); static DEFINE_SPINLOCK(cbs_list_lock); -#define BYTES_PER_KBIT (1000LL / 8) - struct cbs_sched_data { bool offload; int queue; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 4195a4bc26..a498b5d7c5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -522,8 +522,9 @@ static void dev_watchdog(struct timer_list *t) if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); - WARN_ONCE(1, "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out %u ms\n", - dev->name, netdev_drivername(dev), i, timedout_ms); + netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", + raw_smp_processor_id(), + i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); @@ -972,6 +973,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); @@ -1050,6 +1052,7 @@ static void qdisc_free_cb(struct rcu_head *head) static void __qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct net_device *dev = qdisc_dev(qdisc); #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -1060,11 +1063,12 @@ static void __qdisc_destroy(struct Qdisc *qdisc) qdisc_reset(qdisc); + if (ops->destroy) ops->destroy(qdisc); module_put(ops->owner); - netdev_put(qdisc_dev(qdisc), &qdisc->dev_tracker); + netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 7182c5a450..5c16521818 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -38,6 +38,14 @@ void sctp_inq_init(struct sctp_inq *queue) INIT_WORK(&queue->immediate, NULL); } +/* Properly release the chunk which is being worked on. */ +static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk) +{ + if (chunk->head_skb) + chunk->skb = chunk->head_skb; + sctp_chunk_free(chunk); +} + /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { @@ -53,7 +61,7 @@ void sctp_inq_free(struct sctp_inq *queue) * free it as well. */ if (queue->in_progress) { - sctp_chunk_free(queue->in_progress); + sctp_inq_chunk_free(queue->in_progress); queue->in_progress = NULL; } } @@ -130,9 +138,7 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) goto new_skb; } - if (chunk->head_skb) - chunk->skb = chunk->head_skb; - sctp_chunk_free(chunk); + sctp_inq_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 73eebddbbf..0f53a5c6fd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -677,8 +677,6 @@ static bool smc_isascii(char *hostname) static void smc_conn_save_peer_info_fce(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)clc; struct smc_clc_first_contact_ext *fce; int clc_v2_len; @@ -687,17 +685,15 @@ static void smc_conn_save_peer_info_fce(struct smc_sock *smc, return; if (smc->conn.lgr->is_smcd) { - memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, + memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid, SMC_MAX_EID_LEN); - clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, - d1); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1); } else { - memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, + memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid, SMC_MAX_EID_LEN); - clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, - r1); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1); } - fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); + fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); smc->conn.lgr->peer_os = fce->os_type; smc->conn.lgr->peer_smc_release = fce->release; if (smc_isascii(fce->hostname)) @@ -928,6 +924,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + smc->sk.sk_socket->wq.fasync_list = NULL; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up @@ -1048,7 +1045,8 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, { int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; - int i = 1; + int i = 1, entry = 1; + bool is_virtual; u16 chid; if (smcd_indicated(ini->smc_type_v1)) @@ -1060,14 +1058,23 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; + is_virtual = __smc_ism_is_virtual(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { + if (is_virtual && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) + /* It's the last GID-CHID entry left in CLC + * Proposal SMC-Dv2 extension, but a virtual + * ISM device will take two entries. So give + * up it and try the next potential ISM device. + */ + continue; ini->ism_dev[i] = smcd; ini->ism_chid[i] = chid; ini->is_smcd = true; rc = 0; i++; - if (i > SMC_MAX_ISM_DEVS) + entry = is_virtual ? entry + 2 : entry + 1; + if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } } @@ -1149,13 +1156,13 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, } #define SMC_CLC_MAX_ACCEPT_LEN \ - (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ + (sizeof(struct smc_clc_msg_accept_confirm) + \ sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm_v2 *aclc2, + struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int rc = 0; @@ -1165,7 +1172,7 @@ static int smc_connect_clc(struct smc_sock *smc, if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + return smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } @@ -1201,10 +1208,8 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; struct smc_clc_first_contact_ext *fce = - smc_get_clc_first_contact_ext(clc_v2, false); + smc_get_clc_first_contact_ext(aclc, false); struct net *net = sock_net(&smc->sk); int rc; @@ -1327,10 +1332,7 @@ static int smc_connect_rdma(struct smc_sock *smc, } if (aclc->hdr.version > SMC_V1) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; - - eid = clc_v2->r1.eid; + eid = aclc->r1.eid; if (ini->first_contact_local) smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, link->smcibdev, link->gid); @@ -1371,7 +1373,7 @@ connect_abort: * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. */ static int -smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, +smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i; @@ -1398,12 +1400,9 @@ static int smc_connect_ism(struct smc_sock *smc, ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; if (aclc->hdr.version == SMC_V2) { - struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; - if (ini->first_contact_peer) { struct smc_clc_first_contact_ext *fce = - smc_get_clc_first_contact_ext(aclc_v2, true); + smc_get_clc_first_contact_ext(aclc, true); ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); @@ -1411,11 +1410,16 @@ static int smc_connect_ism(struct smc_sock *smc, return rc; } - rc = smc_v2_determine_accepted_chid(aclc_v2, ini); + rc = smc_v2_determine_accepted_chid(aclc, ini); if (rc) return rc; + + if (__smc_ism_is_virtual(ini->ism_chid[ini->ism_selected])) + ini->ism_peer_gid[ini->ism_selected].gid_ext = + ntohll(aclc->d1.gid_ext); + /* for non-virtual ISM devices, peer gid_ext remains 0. */ } - ini->ism_peer_gid[ini->ism_selected] = ntohll(aclc->d0.gid); + ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); @@ -1437,12 +1441,8 @@ static int smc_connect_ism(struct smc_sock *smc, smc_rx_init(smc); smc_tx_init(smc); - if (aclc->hdr.version > SMC_V1) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; - - eid = clc_v2->d1.eid; - } + if (aclc->hdr.version > SMC_V1) + eid = aclc->d1.eid; rc = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); @@ -1493,7 +1493,6 @@ static int smc_connect_check_aclc(struct smc_init_info *ini, static int __smc_connect(struct smc_sock *smc) { u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; - struct smc_clc_msg_accept_confirm_v2 *aclc2; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; u8 *buf = NULL; @@ -1541,11 +1540,10 @@ static int __smc_connect(struct smc_sock *smc) rc = SMC_CLC_DECL_MEM; goto fallback; } - aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf; - aclc = (struct smc_clc_msg_accept_confirm *)aclc2; + aclc = (struct smc_clc_msg_accept_confirm *)buf; /* perform CLC handshake */ - rc = smc_connect_clc(smc, aclc2, ini); + rc = smc_connect_clc(smc, aclc, ini); if (rc) { /* -EAGAIN on timeout, see tcp_recvmsg() */ if (rc == -EAGAIN) { @@ -2106,7 +2104,8 @@ static bool smc_is_already_selected(struct smcd_dev *smcd, /* check for ISM devices matching proposed ISM devices */ static void smc_check_ism_v2_match(struct smc_init_info *ini, - u16 proposed_chid, u64 proposed_gid, + u16 proposed_chid, + struct smcd_gid *proposed_gid, unsigned int *matches) { struct smcd_dev *smcd; @@ -2118,7 +2117,11 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, continue; if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { - ini->ism_peer_gid[*matches] = proposed_gid; + ini->ism_peer_gid[*matches].gid = proposed_gid->gid; + if (__smc_ism_is_virtual(proposed_chid)) + ini->ism_peer_gid[*matches].gid_ext = + proposed_gid->gid_ext; + /* non-virtual ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; @@ -2140,9 +2143,11 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_v2_extension *smc_v2_ext; struct smc_clc_msg_smcd *pclc_smcd; unsigned int matches = 0; + struct smcd_gid smcd_gid; u8 smcd_version; u8 *eid = NULL; int i, rc; + u16 chid; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; @@ -2152,18 +2157,35 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); mutex_lock(&smcd_dev_list.mutex); - if (pclc_smcd->ism.chid) + if (pclc_smcd->ism.chid) { /* check for ISM device matching proposed native ISM device */ + smcd_gid.gid = ntohll(pclc_smcd->ism.gid); + smcd_gid.gid_ext = 0; smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), - ntohll(pclc_smcd->ism.gid), &matches); - for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) { + &smcd_gid, &matches); + } + for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) { /* check for ISM devices matching proposed non-native ISM * devices */ - smc_check_ism_v2_match(ini, - ntohs(smcd_v2_ext->gidchid[i - 1].chid), - ntohll(smcd_v2_ext->gidchid[i - 1].gid), - &matches); + smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); + smcd_gid.gid_ext = 0; + chid = ntohs(smcd_v2_ext->gidchid[i].chid); + if (__smc_ism_is_virtual(chid)) { + if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || + chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) + /* each virtual ISM device takes two GID-CHID + * entries and CHID of the second entry repeats + * that of the first entry. + * + * So check if the next GID-CHID entry exists + * and both two entries' CHIDs are the same. + */ + continue; + smcd_gid.gid_ext = + ntohll(smcd_v2_ext->gidchid[++i].gid); + } + smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches); } mutex_unlock(&smcd_dev_list.mutex); @@ -2212,7 +2234,8 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) goto not_found; ini->is_smcd = true; /* prepare ISM check */ - ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); + ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); + ini->ism_peer_gid[0].gid_ext = 0; rc = smc_find_ism_device(new_smc, ini); if (rc) goto not_found; @@ -2461,7 +2484,7 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; - rc = smc_clc_srv_v2x_features_validate(pclc, ini); + rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini); if (rc) goto out_decl; diff --git a/net/smc/smc.h b/net/smc/smc.h index e377980b84..df64efd2de 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -29,9 +29,6 @@ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ -#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM - * devices - */ #define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; @@ -58,6 +55,13 @@ enum smc_state { /* possible states of an SMC socket */ SMC_PROCESSABORT = 27, }; +enum smc_supplemental_features { + SMC_SPF_VIRT_ISM_DEV = 0, +}; + +#define SMC_FEATURE_MASK \ + (BIT(SMC_SPF_VIRT_ISM_DEV)) + struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ @@ -196,7 +200,6 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ - atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 1489a8421d..9a13709bea 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -387,9 +387,9 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) /* check arriving CLC accept or confirm */ static bool -smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) +smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm *clc) { - struct smc_clc_msg_hdr *hdr = &clc_v2->hdr; + struct smc_clc_msg_hdr *hdr = &clc->hdr; if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) return false; @@ -428,15 +428,16 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) return true; } -static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, - struct smc_init_info *ini) +static int smc_clc_fill_fce_v2x(struct smc_clc_first_contact_ext_v2x *fce_v2x, + struct smc_init_info *ini) { - int ret = sizeof(*fce); + int ret = sizeof(*fce_v2x); - memset(fce, 0, sizeof(*fce)); - fce->fce_v2_base.os_type = SMC_CLC_OS_LINUX; - fce->fce_v2_base.release = ini->release_nr; - memcpy(fce->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname)); + memset(fce_v2x, 0, sizeof(*fce_v2x)); + fce_v2x->fce_v2_base.os_type = SMC_CLC_OS_LINUX; + fce_v2x->fce_v2_base.release = ini->release_nr; + memcpy(fce_v2x->fce_v2_base.hostname, + smc_hostname, sizeof(smc_hostname)); if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) { ret = sizeof(struct smc_clc_first_contact_ext); goto out; @@ -444,9 +445,10 @@ static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, if (ini->release_nr >= SMC_RELEASE_1) { if (!ini->is_smcd) { - fce->max_conns = ini->max_conns; - fce->max_links = ini->max_links; + fce_v2x->max_conns = ini->max_conns; + fce_v2x->max_links = ini->max_links; } + fce_v2x->feature_mask = htons(ini->feature_mask); } out: @@ -458,7 +460,7 @@ out: */ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2; + struct smc_clc_msg_accept_confirm *clc; struct smc_clc_msg_proposal *pclc; struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; @@ -476,12 +478,11 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: - clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm; - if (!smc_clc_msg_acc_conf_valid(clc_v2)) + clc = (struct smc_clc_msg_accept_confirm *)clcm; + if (!smc_clc_msg_acc_conf_valid(clc)) return false; trl = (struct smc_clc_msg_trail *) - ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) - - sizeof(*trl)); + ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; @@ -834,6 +835,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) struct smc_clc_smcd_gid_chid *gidchids; struct smc_clc_msg_proposal_area *pclc; struct smc_clc_ipv6_prefix *ipv6_prfx; + struct net *net = sock_net(&smc->sk); struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; @@ -891,11 +893,13 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) ETH_ALEN); } if (smcd_indicated(ini->smc_type_v1)) { + struct smcd_gid smcd_gid; + /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - pclc_smcd->ism.gid = - htonll(smcd->ops->get_local_gid(smcd)); + smcd->ops->get_local_gid(smcd, &smcd_gid); + pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } @@ -916,6 +920,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd->v2_ext_offset = htons(v2_ext_offset); plen += sizeof(*v2_ext); + v2_ext->feature_mask = htons(SMC_FEATURE_MASK); read_lock(&smc_clc_eid_table.lock); v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN; @@ -927,10 +932,11 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) read_unlock(&smc_clc_eid_table.lock); } if (smcd_indicated(ini->smc_type_v2)) { + struct smcd_gid smcd_gid; u8 *eid = NULL; + int entry = 0; v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; - v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + @@ -942,19 +948,31 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - gidchids[i - 1].gid = - htonll(smcd->ops->get_local_gid(smcd)); - gidchids[i - 1].chid = + smcd->ops->get_local_gid(smcd, &smcd_gid); + gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); + gidchids[entry].gid = htonll(smcd_gid.gid); + if (smc_ism_is_virtual(smcd)) { + /* a virtual ISM device takes two + * entries. CHID of the second entry + * repeats that of the first entry. + */ + gidchids[entry + 1].chid = + gidchids[entry].chid; + gidchids[entry + 1].gid = + htonll(smcd_gid.gid_ext); + entry++; + } + entry++; } - plen += ini->ism_offered_cnt * - sizeof(struct smc_clc_smcd_gid_chid); + plen += entry * sizeof(struct smc_clc_smcd_gid_chid); } + v2_ext->hdr.ism_gid_cnt = entry; } if (smcr_indicated(ini->smc_type_v2)) { memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); - v2_ext->max_conns = SMC_CONN_PER_LGR_PREFER; - v2_ext->max_links = SMC_LINKS_PER_LGR_MAX_PREFER; + v2_ext->max_conns = net->smc.sysctl_max_conns_per_lgr; + v2_ext->max_links = net->smc.sysctl_max_links_per_lgr; } pclc_base->hdr.length = htons(plen); @@ -985,7 +1003,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) vec[i++].iov_len = sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { vec[i].iov_base = gidchids; - vec[i++].iov_len = ini->ism_offered_cnt * + vec[i++].iov_len = v2_ext->hdr.ism_gid_cnt * sizeof(struct smc_clc_smcd_gid_chid); } } @@ -1006,109 +1024,143 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) return reason_code; } +static void +smcd_clc_prep_confirm_accept(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc, + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini, + int *fce_len, + struct smc_clc_first_contact_ext_v2x *fce_v2x, + struct smc_clc_msg_trail *trl) +{ + struct smcd_dev *smcd = conn->lgr->smcd; + struct smcd_gid smcd_gid; + u16 chid; + int len; + + /* SMC-D specific settings */ + memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + smcd->ops->get_local_gid(smcd, &smcd_gid); + clc->hdr.typev1 = SMC_TYPE_D; + clc->d0.gid = htonll(smcd_gid.gid); + clc->d0.token = htonll(conn->rmb_desc->token); + clc->d0.dmbe_size = conn->rmbe_size_comp; + clc->d0.dmbe_idx = 0; + memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + } else { + chid = smc_ism_get_chid(smcd); + clc->d1.chid = htons(chid); + if (eid && eid[0]) + memcpy(clc->d1.eid, eid, SMC_MAX_EID_LEN); + if (__smc_ism_is_virtual(chid)) + clc->d1.gid_ext = htonll(smcd_gid.gid_ext); + len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + *fce_len = smc_clc_fill_fce_v2x(fce_v2x, ini); + len += *fce_len; + } + clc->hdr.length = htons(len); + } + memcpy(trl->eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); +} + +static void +smcr_clc_prep_confirm_accept(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc, + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini, + int *fce_len, + struct smc_clc_first_contact_ext_v2x *fce_v2x, + struct smc_clc_fce_gid_ext *gle, + struct smc_clc_msg_trail *trl) +{ + struct smc_link *link = conn->lnk; + int len; + + /* SMC-R specific settings */ + memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + clc->hdr.typev1 = SMC_TYPE_R; + memcpy(clc->r0.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(clc->r0.qpn, link->roce_qp->qp_num); + clc->r0.rmb_rkey = + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); + clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); + switch (clc->hdr.type) { + case SMC_CLC_ACCEPT: + clc->r0.qp_mtu = link->path_mtu; + break; + case SMC_CLC_CONFIRM: + clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + break; + } + clc->r0.rmbe_size = conn->rmbe_size_comp; + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); + hton24(clc->r0.psn, link->psn_initial); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + } else { + if (eid && eid[0]) + memcpy(clc->r1.eid, eid, SMC_MAX_EID_LEN); + len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + *fce_len = smc_clc_fill_fce_v2x(fce_v2x, ini); + len += *fce_len; + fce_v2x->fce_v2_base.v2_direct = + !link->lgr->uses_gateway; + if (clc->hdr.type == SMC_CLC_CONFIRM) { + memset(gle, 0, sizeof(*gle)); + gle->gid_cnt = ini->smcrv2.gidlist.len; + len += sizeof(*gle); + len += gle->gid_cnt * sizeof(gle->gid[0]); + } + } + clc->hdr.length = htons(len); + } + memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); +} + /* build and send CLC CONFIRM / ACCEPT message */ static int smc_clc_send_confirm_accept(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm_v2 *clc_v2, + struct smc_clc_msg_accept_confirm *clc, int first_contact, u8 version, u8 *eid, struct smc_init_info *ini) { + struct smc_clc_first_contact_ext_v2x fce_v2x; struct smc_connection *conn = &smc->conn; - struct smc_clc_first_contact_ext_v2x fce; - struct smcd_dev *smcd = conn->lgr->smcd; - struct smc_clc_msg_accept_confirm *clc; struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; - int i, len, fce_len; + int i, fce_len; struct kvec vec[5]; struct msghdr msg; /* send SMC Confirm CLC msg */ - clc = (struct smc_clc_msg_accept_confirm *)clc_v2; clc->hdr.version = version; /* SMC version */ if (first_contact) clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK; - if (conn->lgr->is_smcd) { - /* SMC-D specific settings */ - memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - clc->hdr.typev1 = SMC_TYPE_D; - clc->d0.gid = htonll(smcd->ops->get_local_gid(smcd)); - clc->d0.token = htonll(conn->rmb_desc->token); - clc->d0.dmbe_size = conn->rmbe_size_comp; - clc->d0.dmbe_idx = 0; - memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); - if (version == SMC_V1) { - clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); - } else { - clc_v2->d1.chid = htons(smc_ism_get_chid(smcd)); - if (eid && eid[0]) - memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); - len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; - if (first_contact) { - fce_len = smc_clc_fill_fce(&fce, ini); - len += fce_len; - } - clc_v2->hdr.length = htons(len); - } - memcpy(trl.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - } else { - struct smc_link *link = conn->lnk; - - /* SMC-R specific settings */ - memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); - clc->hdr.typev1 = SMC_TYPE_R; - clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - memcpy(clc->r0.lcl.id_for_peer, local_systemid, - sizeof(local_systemid)); - memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); - memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], - ETH_ALEN); - hton24(clc->r0.qpn, link->roce_qp->qp_num); - clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr[link->link_idx]->rkey); - clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); - switch (clc->hdr.type) { - case SMC_CLC_ACCEPT: - clc->r0.qp_mtu = link->path_mtu; - break; - case SMC_CLC_CONFIRM: - clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); - break; - } - clc->r0.rmbe_size = conn->rmbe_size_comp; - clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? - cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : - cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); - hton24(clc->r0.psn, link->psn_initial); - if (version == SMC_V1) { - clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - } else { - if (eid && eid[0]) - memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); - len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; - if (first_contact) { - fce_len = smc_clc_fill_fce(&fce, ini); - len += fce_len; - fce.fce_v2_base.v2_direct = !link->lgr->uses_gateway; - if (clc->hdr.type == SMC_CLC_CONFIRM) { - memset(&gle, 0, sizeof(gle)); - gle.gid_cnt = ini->smcrv2.gidlist.len; - len += sizeof(gle); - len += gle.gid_cnt * sizeof(gle.gid[0]); - } - } - clc_v2->hdr.length = htons(len); - } - memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); - } - + if (conn->lgr->is_smcd) + smcd_clc_prep_confirm_accept(conn, clc, first_contact, + version, eid, ini, &fce_len, + &fce_v2x, &trl); + else + smcr_clc_prep_confirm_accept(conn, clc, first_contact, + version, eid, ini, &fce_len, + &fce_v2x, &gle, &trl); memset(&msg, 0, sizeof(msg)); i = 0; - vec[i].iov_base = clc_v2; + vec[i].iov_base = clc; if (version > SMC_V1) vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : @@ -1120,7 +1172,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, SMCR_CLC_ACCEPT_CONFIRM_LEN) - sizeof(trl); if (version > SMC_V1 && first_contact) { - vec[i].iov_base = &fce; + vec[i].iov_base = &fce_v2x; vec[i++].iov_len = fce_len; if (!conn->lgr->is_smcd) { if (clc->hdr.type == SMC_CLC_CONFIRM) { @@ -1142,16 +1194,16 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 cclc_v2; + struct smc_clc_msg_accept_confirm cclc; int reason_code = 0; int len; /* send SMC Confirm CLC msg */ - memset(&cclc_v2, 0, sizeof(cclc_v2)); - cclc_v2.hdr.type = SMC_CLC_CONFIRM; - len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, + memset(&cclc, 0, sizeof(cclc)); + cclc.hdr.type = SMC_CLC_CONFIRM; + len = smc_clc_send_confirm_accept(smc, &cclc, clnt_first_contact, version, eid, ini); - if (len < ntohs(cclc_v2.hdr.length)) { + if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; @@ -1167,26 +1219,29 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, u8 version, u8 *negotiated_eid, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 aclc_v2; + struct smc_clc_msg_accept_confirm aclc; int len; - memset(&aclc_v2, 0, sizeof(aclc_v2)); - aclc_v2.hdr.type = SMC_CLC_ACCEPT; - len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, + memset(&aclc, 0, sizeof(aclc)); + aclc.hdr.type = SMC_CLC_ACCEPT; + len = smc_clc_send_confirm_accept(new_smc, &aclc, srv_first_contact, version, negotiated_eid, ini); - if (len < ntohs(aclc_v2.hdr.length)) + if (len < ntohs(aclc.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; return len > 0 ? 0 : len; } -int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, +int smc_clc_srv_v2x_features_validate(struct smc_sock *smc, + struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_v2_extension *pclc_v2_ext; + struct net *net = sock_net(&smc->sk); ini->max_conns = SMC_CONN_PER_LGR_MAX; ini->max_links = SMC_LINKS_ADD_LNK_MAX; + ini->feature_mask = SMC_FEATURE_MASK; if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || ini->release_nr < SMC_RELEASE_1) @@ -1197,11 +1252,13 @@ int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, return SMC_CLC_DECL_NOV2EXT; if (ini->smcr_version & SMC_V2) { - ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, SMC_CONN_PER_LGR_PREFER); + ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, + net->smc.sysctl_max_conns_per_lgr); if (ini->max_conns < SMC_CONN_PER_LGR_MIN) return SMC_CLC_DECL_MAXCONNERR; - ini->max_links = min_t(u8, pclc_v2_ext->max_links, SMC_LINKS_PER_LGR_MAX_PREFER); + ini->max_links = min_t(u8, pclc_v2_ext->max_links, + net->smc.sysctl_max_links_per_lgr); if (ini->max_links < SMC_LINKS_ADD_LNK_MIN) return SMC_CLC_DECL_MAXLINKERR; } @@ -1228,6 +1285,8 @@ int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, return SMC_CLC_DECL_MAXLINKERR; ini->max_links = fce_v2x->max_links; } + /* common supplemental features of server and client */ + ini->feature_mask = ntohs(fce_v2x->feature_mask) & SMC_FEATURE_MASK; return 0; } @@ -1235,10 +1294,8 @@ int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)cclc; struct smc_clc_first_contact_ext *fce = - smc_get_clc_first_contact_ext(clc_v2, ini->is_smcd); + smc_get_clc_first_contact_ext(cclc, ini->is_smcd); struct smc_clc_first_contact_ext_v2x *fce_v2x = (struct smc_clc_first_contact_ext_v2x *)fce; @@ -1258,6 +1315,8 @@ int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, if (fce_v2x->max_links != ini->max_links) return SMC_CLC_DECL_MAXLINKERR; } + /* common supplemental features returned by client */ + ini->feature_mask = ntohs(fce_v2x->feature_mask); return 0; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 08155a96a0..a9f9bdd26d 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -138,7 +138,8 @@ struct smc_clc_v2_extension { u8 roce[16]; /* RoCEv2 GID */ u8 max_conns; u8 max_links; - u8 reserved[14]; + __be16 feature_mask; + u8 reserved[12]; u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -171,6 +172,11 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ #define SMC_CLC_MAX_V6_PREFIX 8 #define SMC_CLC_MAX_UEID 8 +#define SMCD_CLC_MAX_V2_GID_ENTRIES 8 /* max # of CHID-GID entries in CLC + * proposal SMC-Dv2 extension. + * each ISM device takes one entry and + * each virtual ISM takes two entries. + */ struct smc_clc_msg_proposal_area { struct smc_clc_msg_proposal pclc_base; @@ -180,7 +186,8 @@ struct smc_clc_msg_proposal_area { struct smc_clc_v2_extension pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; - struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS]; + struct smc_clc_smcd_gid_chid + pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; }; @@ -240,9 +247,14 @@ struct smc_clc_first_contact_ext { struct smc_clc_first_contact_ext_v2x { struct smc_clc_first_contact_ext fce_v2_base; - u8 max_conns; /* for SMC-R only */ - u8 max_links; /* for SMC-R only */ - u8 reserved3[2]; + union { + struct { + u8 max_conns; /* for SMC-R only */ + u8 max_links; /* for SMC-R only */ + }; + u8 reserved3[2]; /* for SMC-D only */ + }; + __be16 feature_mask; __be32 vendor_exp_options; u8 reserved4[8]; } __packed; /* format defined in @@ -259,28 +271,21 @@ struct smc_clc_fce_gid_ext { struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { - struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ - struct { /* SMC-D */ - struct smcd_clc_msg_accept_confirm_common d0; - u32 reserved5[3]; - }; - }; -} __packed; /* format defined in RFC7609 */ - -struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */ - struct smc_clc_msg_hdr hdr; - union { struct { /* SMC-R */ struct smcr_clc_msg_accept_confirm r0; - u8 eid[SMC_MAX_EID_LEN]; - u8 reserved6[8]; - } r1; + struct { /* v2 only */ + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved6[8]; + } __packed r1; + }; struct { /* SMC-D */ struct smcd_clc_msg_accept_confirm_common d0; - __be16 chid; - u8 eid[SMC_MAX_EID_LEN]; - u8 reserved5[8]; - } d1; + struct { /* v2 only, but 12 bytes reserved in v1 */ + __be16 chid; + u8 eid[SMC_MAX_EID_LEN]; + __be64 gid_ext; + } __packed d1; + }; }; }; @@ -389,24 +394,23 @@ smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) } static inline struct smc_clc_first_contact_ext * -smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm_v2 *clc_v2, +smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm *clc, bool is_smcd) { int clc_v2_len; - if (clc_v2->hdr.version == SMC_V1 || - !(clc_v2->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + if (clc->hdr.version == SMC_V1 || + !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) return NULL; if (is_smcd) clc_v2_len = - offsetofend(struct smc_clc_msg_accept_confirm_v2, d1); + offsetofend(struct smc_clc_msg_accept_confirm, d1); else clc_v2_len = - offsetofend(struct smc_clc_msg_accept_confirm_v2, r1); + offsetofend(struct smc_clc_msg_accept_confirm, r1); - return (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + - clc_v2_len); + return (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); } struct smcd_dev; @@ -422,7 +426,8 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, u8 version, u8 *negotiated_eid, struct smc_init_info *ini); -int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, +int smc_clc_srv_v2x_features_validate(struct smc_sock *smc, + struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini); int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, struct smc_init_info *ini); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d520ee62c8..e4c8584112 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -506,6 +506,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smcd_dev *smcd = lgr->smcd; + struct smcd_gid smcd_gid; struct nlattr *attrs; void *nlh; @@ -521,13 +522,19 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; + smcd->ops->get_local_gid(smcd, &smcd_gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, - smcd->ops->get_local_gid(smcd), - SMC_NLA_LGR_D_PAD)) + smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; - if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_EXT_GID, + smcd_gid.gid_ext, SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_EXT_GID, + lgr->peer_gid.gid_ext, SMC_NLA_LGR_D_PAD)) + goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) @@ -876,7 +883,10 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; get_device(smcd->ops->get_dev(smcd)); - lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; + lgr->peer_gid.gid = + ini->ism_peer_gid[ini->ism_selected].gid; + lgr->peer_gid.gid_ext = + ini->ism_peer_gid[ini->ism_selected].gid_ext; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; @@ -1514,7 +1524,8 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) } /* Called when peer lgr shutdown (regularly or abnormally) is received */ -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) +void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, + unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); @@ -1522,9 +1533,12 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) /* run common cleanup function and build free list */ spin_lock_bh(&dev->lgr_lock); list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { - if ((!peer_gid || lgr->peer_gid == peer_gid) && + if ((!peer_gid->gid || + (lgr->peer_gid.gid == peer_gid->gid && + !smc_ism_is_virtual(dev) ? 1 : + lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - if (peer_gid) /* peer triggered termination */ + if (peer_gid->gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); lgr->freeing = 1; @@ -1860,9 +1874,18 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, } static bool smcd_lgr_match(struct smc_link_group *lgr, - struct smcd_dev *smcismdev, u64 peer_gid) + struct smcd_dev *smcismdev, + struct smcd_gid *peer_gid) { - return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; + if (lgr->peer_gid.gid != peer_gid->gid || + lgr->smcd != smcismdev) + return false; + + if (smc_ism_is_virtual(smcismdev) && + lgr->peer_gid.gid_ext != peer_gid->gid_ext) + return false; + + return true; } /* create a new SMC connection (and a new link group if necessary) */ @@ -1892,7 +1915,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], - ini->ism_peer_gid[ini->ism_selected]) : + &ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 120027d404..1f17537603 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -17,9 +17,11 @@ #include <linux/pci.h> #include <rdma/ib_verbs.h> #include <net/genetlink.h> +#include <net/smc.h> #include "smc.h" #include "smc_ib.h" +#include "smc_clc.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ #define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ @@ -355,7 +357,7 @@ struct smc_link_group { /* max links can be added in lgr */ }; struct { /* SMC-D */ - u64 peer_gid; + struct smcd_gid peer_gid; /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ @@ -392,6 +394,11 @@ struct smc_init_info_smcrv2 { struct smc_gidlist gidlist; }; +#define SMC_MAX_V2_ISM_DEVS SMCD_CLC_MAX_V2_GID_ENTRIES + /* max # of proposed non-native ISM devices, + * which can't exceed the max # of CHID-GID + * entries in CLC proposal SMC-Dv2 extension. + */ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; @@ -401,6 +408,7 @@ struct smc_init_info { u8 max_links; u8 first_contact_peer; u8 first_contact_local; + u16 feature_mask; unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -416,9 +424,9 @@ struct smc_init_info { u32 ib_clcqpn; struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ - u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1]; - struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1]; - u16 ism_chid[SMC_MAX_ISM_DEVS + 1]; + struct smcd_gid ism_peer_gid[SMC_MAX_V2_ISM_DEVS + 1]; + struct smcd_dev *ism_dev[SMC_MAX_V2_ISM_DEVS + 1]; + u16 ism_chid[SMC_MAX_V2_ISM_DEVS + 1]; u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; @@ -544,7 +552,7 @@ void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, +void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index fb9e5cc128..5a33908015 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -21,6 +21,7 @@ #include "smc.h" #include "smc_core.h" +#include "smc_ism.h" struct smc_diag_dump_ctx { int pos[2]; @@ -167,12 +168,16 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; struct smcd_dev *smcd = conn->lgr->smcd; + struct smcd_gid smcd_gid; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); - dinfo.peer_gid = conn->lgr->peer_gid; - dinfo.my_gid = smcd->ops->get_local_gid(smcd); + dinfo.peer_gid = conn->lgr->peer_gid.gid; + dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; + smcd->ops->get_local_gid(smcd, &smcd_gid); + dinfo.my_gid = smcd_gid.gid; + dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 89981dbe46..97704a9e84 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -844,7 +844,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 7 : 6; - smc_order = MAX_ORDER - cqe_size_order; + smc_order = MAX_PAGE_ORDER - cqe_size_order; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index fbee249309..ac88de2a06 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -43,8 +43,30 @@ static struct ism_client smc_ism_client = { }; #endif +static void smc_ism_create_system_eid(void) +{ + struct smc_ism_seid *seid = + (struct smc_ism_seid *)smc_ism_v2_system_eid; +#if IS_ENABLED(CONFIG_S390) + struct cpuid id; + u16 ident_tail; + char tmp[5]; + + memcpy(seid->seid_string, "IBM-SYSZ-ISMSEID00000000", 24); + get_cpu_id(&id); + ident_tail = (u16)(id.ident & SMC_ISM_IDENT_MASK); + snprintf(tmp, 5, "%04X", ident_tail); + memcpy(seid->serial_number, tmp, 4); + snprintf(tmp, 5, "%04X", id.machine); + memcpy(seid->type, tmp, 4); +#else + memset(seid, 0, SMC_MAX_EID_LEN); +#endif +} + /* Test if an ISM communication is possible - same CPC */ -int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, + struct smcd_dev *smcd) { return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, vlan_id); @@ -208,7 +230,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.dmb_len = dmb_len; dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid; + dmb.rgid = lgr->peer_gid.gid; rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; @@ -340,18 +362,20 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { + struct smcd_gid peer_gid = { .gid = wrk->event.tok, + .gid_ext = 0 }; union smcd_sw_event_info ev_info; ev_info.info = wrk->event.info; switch (wrk->event.code) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ - smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id); + smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, - wrk->event.tok, + &peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_TESTLINK, ev_info.info); @@ -365,10 +389,12 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); + struct smcd_gid smcd_gid = { .gid = wrk->event.tok, + .gid_ext = 0 }; switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ - smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK); + smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; @@ -426,14 +452,8 @@ static void smcd_register_dev(struct ism_dev *ism) mutex_lock(&smcd_dev_list.mutex); if (list_empty(&smcd_dev_list.list)) { - u8 *system_eid = NULL; - - system_eid = smcd->ops->get_system_eid(); - if (smcd->ops->supports_v2()) { + if (smcd->ops->supports_v2()) smc_ism_v2_capable = true; - memcpy(smc_ism_v2_system_eid, system_eid, - SMC_MAX_EID_LEN); - } } /* sort list: devices without pnetid before devices with pnetid */ if (smcd->pnetid[0]) @@ -525,7 +545,7 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, + rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); @@ -537,10 +557,10 @@ int smc_ism_init(void) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) smc_ism_v2_capable = false; - memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); + smc_ism_create_system_eid(); +#if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif return rc; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 832b2f42d7..ffff40c30a 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -15,6 +15,9 @@ #include "smc.h" +#define SMC_VIRTUAL_ISM_CHID_MASK 0xFF00 +#define SMC_ISM_IDENT_MASK 0x00FFFF + struct smcd_dev_list { /* List of SMCD devices */ struct list_head list; struct mutex mutex; /* Protects list of devices */ @@ -28,9 +31,16 @@ struct smc_ism_vlanid { /* VLAN id set on ISM device */ refcount_t refcnt; /* Reference count */ }; +struct smc_ism_seid { + u8 seid_string[24]; + u8 serial_number[4]; + u8 type[4]; +}; + struct smcd_dev; -int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, + struct smcd_dev *dev); void smc_ism_set_conn(struct smc_connection *conn); void smc_ism_unset_conn(struct smc_connection *conn); int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); @@ -56,4 +66,22 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, return rc < 0 ? rc : 0; } +static inline bool __smc_ism_is_virtual(u16 chid) +{ + /* CHIDs in range of 0xFF00 to 0xFFFF are reserved + * for virtual ISM device. + * + * loopback-ism: 0xFFFF + * virtio-ism: 0xFF00 ~ 0xFFFE + */ + return ((chid & 0xFF00) == 0xFF00); +} + +static inline bool smc_ism_is_virtual(struct smcd_dev *smcd) +{ + u16 chid = smcd->ops->get_chid(smcd); + + return __smc_ism_is_virtual(chid); +} + #endif diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 11775401df..2adb92b8c4 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -806,6 +806,16 @@ static void smc_pnet_create_pnetids_list(struct net *net) u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; + /* Newly created netns do not have devices. + * Do not even acquire rtnl. + */ + if (list_empty(&net->dev_base_head)) + return; + + /* Note: This might not be needed, because smc_pnet_netdev_event() + * is also calling smc_pnet_add_base_pnetid() when handling + * NETDEV_UP event. + */ rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); @@ -1103,8 +1113,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && - (!ini->ism_peer_gid[0] || - !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id, + (!ini->ism_peer_gid[0].gid || + !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 5cbc18c6e6..a5946d1b9d 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -25,6 +25,10 @@ static int max_sndbuf = INT_MAX / 2; static int max_rcvbuf = INT_MAX / 2; static const int net_smc_wmem_init = (64 * 1024); static const int net_smc_rmem_init = (64 * 1024); +static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN; +static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; +static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; +static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; static struct ctl_table smc_table[] = { { @@ -68,6 +72,24 @@ static struct ctl_table smc_table[] = { .extra1 = &min_rcvbuf, .extra2 = &max_rcvbuf, }, + { + .procname = "smcr_max_links_per_lgr", + .data = &init_net.smc.sysctl_max_links_per_lgr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &links_per_lgr_min, + .extra2 = &links_per_lgr_max, + }, + { + .procname = "smcr_max_conns_per_lgr", + .data = &init_net.smc.sysctl_max_conns_per_lgr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &conns_per_lgr_min, + .extra2 = &conns_per_lgr_max, + }, { } }; @@ -97,6 +119,8 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); + net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; + net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; return 0; diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 0becc11bd2..eb2465ae1e 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -23,6 +23,8 @@ void __net_exit smc_sysctl_net_exit(struct net *net); static inline int smc_sysctl_net_init(struct net *net) { net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; + net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; return 0; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3b0ff3b589..214ac3cbcf 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -621,7 +621,7 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc = 0; @@ -655,34 +655,6 @@ out: return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) -{ - int rc; - - /* This make sure only one can send simultaneously to prevent wasting - * of CPU and CDC slot. - * Record whether someone has tried to push while we are pushing. - */ - if (atomic_inc_return(&conn->tx_pushing) > 1) - return 0; - -again: - atomic_set(&conn->tx_pushing, 1); - smp_wmb(); /* Make sure tx_pushing is 1 before real send */ - rc = __smc_tx_sndbuf_nonempty(conn); - - /* We need to check whether someone else have added some data into - * the send queue and tried to push but failed after the atomic_set() - * when we are pushing. - * If so, we need to push again to prevent those data hang in the send - * queue. - */ - if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) - goto again; - - return rc; -} - /* Wakeup sndbuf consumers from process context * since there is more data to transmit. The caller * must hold sock lock. diff --git a/net/socket.c b/net/socket.c index 89d79205bf..ed3df2f749 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2163,10 +2163,9 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr_storage address; int err; struct msghdr msg; - struct iovec iov; int fput_needed; - err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter); + err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2228,11 +2227,10 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, .msg_name = addr ? (struct sockaddr *)&address : NULL, }; struct socket *sock; - struct iovec iov; int err, err2; int fput_needed; - err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter); + err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 1af71fbb0d..c7af0220f8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2280,6 +2280,7 @@ static void __exit exit_rpcsec_gss(void) } MODULE_ALIAS("rpc-auth-6"); +MODULE_DESCRIPTION("Sun RPC Kerberos RPCSEC_GSS client authentication"); MODULE_LICENSE("GPL"); module_param_named(expired_cred_retry_delay, gss_expired_cred_retry_delay, diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index f6fc80e1d6..3366505bc6 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -659,6 +659,7 @@ static void __exit cleanup_kerberos_module(void) gss_mech_unregister(&gss_kerberos_mech); } +MODULE_DESCRIPTION("Sun RPC Kerberos 5 module"); MODULE_LICENSE("GPL"); module_init(init_kerberos_module); module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 18734e70c5..24de941847 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -866,14 +866,6 @@ svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) struct xdr_buf databody_integ; struct xdr_netobj checksum; - /* NFS READ normally uses splice to send data in-place. However - * the data in cache can change after the reply's MIC is computed - * but before the RPC reply is sent. To prevent the client from - * rejecting the server-computed MIC in this somewhat rare case, - * do not use splice with the GSS integrity service. - */ - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); - /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; @@ -948,8 +940,6 @@ svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) struct xdr_buf *buf = xdr->buf; unsigned int saved_len; - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); - if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (rqstp->rq_deferred) { @@ -2014,6 +2004,11 @@ svcauth_gss_domain_release(struct auth_domain *dom) call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } +static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) +{ + return svcauth_gss_flavor(rqstp->rq_gssclient); +} + static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, @@ -2022,6 +2017,7 @@ static struct auth_ops svcauthops_gss = { .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, + .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index daa9582ec8..cda0935a68 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -287,8 +287,14 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { - clnt->cl_nodelen = strlcpy(clnt->cl_nodename, - nodename, sizeof(clnt->cl_nodename)); + ssize_t copied; + + copied = strscpy(clnt->cl_nodename, + nodename, sizeof(clnt->cl_nodename)); + + clnt->cl_nodelen = copied < 0 + ? sizeof(clnt->cl_nodename) - 1 + : copied; } static int rpc_client_register(struct rpc_clnt *clnt, @@ -797,15 +803,24 @@ out_revert: } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); -static -int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, - void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) +static struct rpc_xprt_switch *rpc_clnt_xprt_switch_get(struct rpc_clnt *clnt) { struct rpc_xprt_switch *xps; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); + + return xps; +} + +static +int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, + void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) +{ + struct rpc_xprt_switch *xps; + + xps = rpc_clnt_xprt_switch_get(clnt); if (xps == NULL) return -EAGAIN; func(xpi, xps); @@ -1302,8 +1317,10 @@ static void call_bc_encode(struct rpc_task *task); * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it * @req: RPC request + * @timeout: timeout values to use for this task */ -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, + struct rpc_timeout *timeout) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { @@ -1322,7 +1339,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) return task; } - xprt_init_bc_request(req, task); + xprt_init_bc_request(req, task, timeout); task->tk_action = call_bc_encode; atomic_inc(&task->tk_count); @@ -2206,9 +2223,7 @@ call_connect_status(struct rpc_task *task) struct rpc_xprt *saved = task->tk_xprt; struct rpc_xprt_switch *xps; - rcu_read_lock(); - xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); - rcu_read_unlock(); + xps = rpc_clnt_xprt_switch_get(clnt); if (xps->xps_nxprts > 1) { long value; @@ -3116,7 +3131,6 @@ static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_add_xprt_test *data) { - struct rpc_xprt_switch *xps; struct rpc_xprt *main_xprt; int status = 0; @@ -3124,7 +3138,6 @@ static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, rcu_read_lock(); main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); - xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, (struct sockaddr *)&main_xprt->addr); rcu_read_unlock(); @@ -3135,7 +3148,6 @@ static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, status = rpc_clnt_add_xprt_helper(clnt, xprt, data); out: xprt_put(xprt); - xprt_switch_put(xps); return status; } @@ -3250,34 +3262,27 @@ rpc_set_connect_timeout(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_set_connect_timeout); -void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) -{ - rcu_read_lock(); - xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); - void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; - rcu_read_lock(); - xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); - rcu_read_unlock(); + xps = rpc_clnt_xprt_switch_get(clnt); xprt_set_online_locked(xprt, xps); + xprt_switch_put(xps); } void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { + struct rpc_xprt_switch *xps; + if (rpc_clnt_xprt_switch_has_addr(clnt, (const struct sockaddr *)&xprt->addr)) { return rpc_clnt_xprt_set_online(clnt, xprt); } - rcu_read_lock(); - rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), - xprt); - rcu_read_unlock(); + + xps = rpc_clnt_xprt_switch_get(clnt); + rpc_xprt_switch_add_xprt(xps, xprt); + xprt_switch_put(xps); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 691c0000e9..bab6cab294 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -148,6 +148,7 @@ cleanup_sunrpc(void) #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } +MODULE_DESCRIPTION("Sun RPC core"); MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 3f2ea7a049..b969e505c7 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -463,7 +463,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - kref_init(&serv->sv_refcnt); serv->sv_stats = prog->pg_stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; @@ -564,11 +563,13 @@ EXPORT_SYMBOL_GPL(svc_create_pooled); * protect sv_permsocks and sv_tempsocks. */ void -svc_destroy(struct kref *ref) +svc_destroy(struct svc_serv **servp) { - struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt); + struct svc_serv *serv = *servp; unsigned int i; + *servp = NULL; + dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name); timer_shutdown_sync(&serv->sv_temptimer); @@ -675,7 +676,6 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return ERR_PTR(-ENOMEM); - svc_get(serv); spin_lock_bh(&serv->sv_lock); serv->sv_nrthreads += 1; spin_unlock_bh(&serv->sv_lock); @@ -935,11 +935,6 @@ svc_exit_thread(struct svc_rqst *rqstp) svc_rqst_free(rqstp); - svc_put(serv); - /* That svc_put() cannot be the last, because the thread - * waiting for SP_VICTIM_REMAINS to clear must hold - * a reference. So it is still safe to access pool. - */ clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags); } EXPORT_SYMBOL_GPL(svc_exit_thread); @@ -1305,8 +1300,6 @@ svc_process_common(struct svc_rqst *rqstp) int rc; __be32 *p; - /* Will be turned off by GSS integrity and privacy services */ - set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); @@ -1557,6 +1550,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) { struct rpc_task *task; int proc_error; + struct rpc_timeout timeout; /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xid = req->rq_xid; @@ -1602,8 +1596,16 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) return; } /* Finally, send the reply synchronously */ + if (rqstp->bc_to_initval > 0) { + timeout.to_initval = rqstp->bc_to_initval; + timeout.to_retries = rqstp->bc_to_retries; + } else { + timeout.to_initval = req->rq_xprt->timeout->to_initval; + timeout.to_retries = req->rq_xprt->timeout->to_retries; + } memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); - task = rpc_run_bc_task(req); + task = rpc_run_bc_task(req, &timeout); + if (IS_ERR(task)) return; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 1b71055fc3..b4a85a227b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1362,29 +1362,36 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen) } EXPORT_SYMBOL_GPL(svc_xprt_names); - /*----------------------------------------------------------------------------*/ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) { unsigned int pidx = (unsigned int)*pos; - struct svc_serv *serv = m->private; + struct svc_info *si = m->private; dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + mutex_lock(si->mutex); + if (!pidx) return SEQ_START_TOKEN; - return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]); + if (!si->serv) + return NULL; + return pidx > si->serv->sv_nrpools ? NULL + : &si->serv->sv_pools[pidx - 1]; } static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) { struct svc_pool *pool = p; - struct svc_serv *serv = m->private; + struct svc_info *si = m->private; + struct svc_serv *serv = si->serv; dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); - if (p == SEQ_START_TOKEN) { + if (!serv) { + pool = NULL; + } else if (p == SEQ_START_TOKEN) { pool = &serv->sv_pools[0]; } else { unsigned int pidx = (pool - &serv->sv_pools[0]); @@ -1399,6 +1406,9 @@ static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) static void svc_pool_stats_stop(struct seq_file *m, void *p) { + struct svc_info *si = m->private; + + mutex_unlock(si->mutex); } static int svc_pool_stats_show(struct seq_file *m, void *p) @@ -1426,14 +1436,18 @@ static const struct seq_operations svc_pool_stats_seq_ops = { .show = svc_pool_stats_show, }; -int svc_pool_stats_open(struct svc_serv *serv, struct file *file) +int svc_pool_stats_open(struct svc_info *info, struct file *file) { + struct seq_file *seq; int err; err = seq_open(file, &svc_pool_stats_seq_ops); - if (!err) - ((struct seq_file *) file->private_data)->private = serv; - return err; + if (err) + return err; + seq = file->private_data; + seq->private = info; + + return 0; } EXPORT_SYMBOL(svc_pool_stats_open); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index aa4429d0b8..1619211f09 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -160,6 +160,22 @@ svc_auth_unregister(rpc_authflavor_t flavor) } EXPORT_SYMBOL_GPL(svc_auth_unregister); +/** + * svc_auth_flavor - return RPC transaction's RPC_AUTH flavor + * @rqstp: RPC transaction context + * + * Returns an RPC flavor or GSS pseudoflavor. + */ +rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp) +{ + struct auth_ops *aops = rqstp->rq_authop; + + if (!aops->pseudoflavor) + return aops->flavour; + return aops->pseudoflavor(rqstp); +} +EXPORT_SYMBOL_GPL(svc_auth_flavor); + /************************************************** * 'auth_domains' are stored in a hash table indexed by name. * When the last reference to an 'auth_domain' is dropped, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e0ce427627..6b3f01beb2 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1049,18 +1049,14 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) struct rpc_rqst *req = NULL; struct kvec *src, *dst; __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - __be32 xid; - __be32 calldir; - - xid = *p++; - calldir = *p; + __be32 xid = *p; if (!bc_xprt) return -EAGAIN; spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) - goto unlock_notfound; + goto unlock_eagain; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); /* @@ -1077,12 +1073,6 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) rqstp->rq_arg.len = 0; spin_unlock(&bc_xprt->queue_lock); return 0; -unlock_notfound: - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, ntohl(xid)); unlock_eagain: spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; @@ -1216,15 +1206,6 @@ err_noclose: * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. - * - * Note that the send is non-blocking. The caller has incremented - * the reference count on each page backing the RPC message, and - * the network layer will "put" these pages when transmission is - * complete. - * - * This is safe for our RPC services because the memory backing - * the head and tail components is never kmalloc'd. These always - * come from pages in the svc_rqst::rq_pages array. */ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, rpc_fraghdr marker, unsigned int *sentp) @@ -1254,6 +1235,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); + page_frag_free(buf); if (ret < 0) return ret; *sentp += ret; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 6cc9ffac96..af13fdfa66 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1986,7 +1986,8 @@ void xprt_release(struct rpc_task *task) #ifdef CONFIG_SUNRPC_BACKCHANNEL void -xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) +xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, + const struct rpc_timeout *to) { struct xdr_buf *xbufp = &req->rq_snd_buf; @@ -1999,8 +2000,13 @@ xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) */ xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + xbufp->tail[0].iov_len; - - xprt_init_majortimeo(task, req, req->rq_xprt->timeout); + /* + * Backchannel Replies are sent with !RPC_TASK_SOFT and + * RPC_TASK_NO_RETRANS_TIMEOUT. The major timeout setting + * affects only how long each Reply waits to be sent when + * a transport connection cannot be established. + */ + xprt_init_majortimeo(task, req, to); } #endif diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index f0d5eeed4c..f86970733e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -256,28 +256,44 @@ out_err: return rc; } +struct workqueue_struct *svcrdma_wq; + void svc_rdma_cleanup(void) { - dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); svc_unreg_xprt_class(&svc_rdma_class); svc_rdma_proc_cleanup(); + if (svcrdma_wq) { + struct workqueue_struct *wq = svcrdma_wq; + + svcrdma_wq = NULL; + destroy_workqueue(wq); + } + + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); } int svc_rdma_init(void) { + struct workqueue_struct *wq; int rc; - dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); - dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); - dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + wq = alloc_workqueue("svcrdma", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; rc = svc_rdma_proc_init(); - if (rc) + if (rc) { + destroy_workqueue(wq); return rc; + } - /* Register RDMA with the SVC transport switch */ + svcrdma_wq = wq; svc_reg_xprt_class(&svc_rdma_class); + + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 7420a2c990..c9be677864 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -76,15 +76,12 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst, struct svc_rdma_send_ctxt *sctxt) { - struct svc_rdma_recv_ctxt *rctxt; + struct svc_rdma_pcl empty_pcl; int ret; - rctxt = svc_rdma_recv_ctxt_get(rdma); - if (!rctxt) - return -EIO; - - ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf); - svc_rdma_recv_ctxt_put(rdma, rctxt); + pcl_init(&empty_pcl); + ret = svc_rdma_map_reply_msg(rdma, sctxt, &empty_pcl, &empty_pcl, + &rqst->rq_snd_buf); if (ret < 0) return -EIO; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3b05f90a3e..d72953f292 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -115,13 +115,6 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } -static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_rq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -130,7 +123,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) dma_addr_t addr; void *buffer; - ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node); + ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node); if (!ctxt) goto fail0; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); @@ -156,6 +149,7 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->rc_recv_sge.length = rdma->sc_max_req_size; ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; ctxt->rc_recv_buf = buffer; + svc_rdma_cc_init(rdma, &ctxt->rc_cc); return ctxt; fail2: @@ -204,18 +198,11 @@ struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) node = llist_del_first(&rdma->sc_recv_ctxts); if (!node) - goto out_empty; - ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); + return NULL; -out: + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); ctxt->rc_page_count = 0; return ctxt; - -out_empty: - ctxt = svc_rdma_recv_ctxt_alloc(rdma); - if (!ctxt) - return NULL; - goto out; } /** @@ -227,6 +214,13 @@ out_empty: void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt) { + svc_rdma_cc_release(rdma, &ctxt->rc_cc, DMA_FROM_DEVICE); + + /* @rc_page_count is normally zero here, but error flows + * can leave pages in @rc_pages. + */ + release_pages(ctxt->rc_pages, ctxt->rc_page_count); + pcl_free(&ctxt->rc_call_pcl); pcl_free(&ctxt->rc_read_pcl); pcl_free(&ctxt->rc_write_pcl); @@ -271,13 +265,13 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, if (!ctxt) break; - trace_svcrdma_post_recv(ctxt); + trace_svcrdma_post_recv(&ctxt->rc_cid); ctxt->rc_recv_wr.next = recv_chain; recv_chain = &ctxt->rc_recv_wr; rdma->sc_pending_recvs++; } if (!recv_chain) - return false; + return true; ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr); if (ret) @@ -301,10 +295,27 @@ err_free: * svc_rdma_post_recvs - Post initial set of Recv WRs * @rdma: fresh svcxprt_rdma * - * Returns true if successful, otherwise false. + * Return values: + * %true: Receive Queue initialization successful + * %false: memory allocation or DMA error */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { + unsigned int total; + + /* For each credit, allocate enough recv_ctxts for one + * posted Receive and one RPC in process. + */ + total = (rdma->sc_max_requests * 2) + rdma->sc_recv_batch; + while (total--) { + struct svc_rdma_recv_ctxt *ctxt; + + ctxt = svc_rdma_recv_ctxt_alloc(rdma); + if (!ctxt) + return false; + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + } + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests); } @@ -373,6 +384,10 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { list_del(&ctxt->rc_list); svc_rdma_recv_ctxt_put(rdma, ctxt); @@ -754,6 +769,122 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, return true; } +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_MSG type message + * with a single Read chunk (only the upper layer data payload + * was conveyed via RDMA Read). + */ +static void svc_rdma_read_complete_one(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct svc_rdma_chunk *chunk = pcl_first_chunk(&ctxt->rc_read_pcl); + struct xdr_buf *buf = &rqstp->rq_arg; + unsigned int length; + + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. + */ + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; + + /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). + * + * If the client already rounded up the chunk length, the + * length does not change. Otherwise, the length of the page + * list is increased to include XDR round-up. + * + * Currently these chunks always start at page offset 0, + * thus the rounded-up length never crosses a page boundary. + */ + buf->pages = &rqstp->rq_pages[0]; + length = xdr_align_size(chunk->ch_length); + buf->page_len = length; + buf->len += length; + buf->buflen += length; +} + +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_MSG type message + * with payload in multiple Read chunks and no PZRC. + */ +static void svc_rdma_read_complete_multiple(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *buf = &rqstp->rq_arg; + + buf->len += ctxt->rc_readbytes; + buf->buflen += ctxt->rc_readbytes; + + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes); + buf->pages = &rqstp->rq_pages[1]; + buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len; +} + +/* Finish constructing the RPC Call message in rqstp::rq_arg. + * + * The incoming RPC/RDMA message is an RDMA_NOMSG type message + * (the RPC message body was conveyed via RDMA Read). + */ +static void svc_rdma_read_complete_pzrc(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *buf = &rqstp->rq_arg; + + buf->len += ctxt->rc_readbytes; + buf->buflen += ctxt->rc_readbytes; + + buf->head[0].iov_base = page_address(rqstp->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, ctxt->rc_readbytes); + buf->pages = &rqstp->rq_pages[1]; + buf->page_len = ctxt->rc_readbytes - buf->head[0].iov_len; +} + +static noinline void svc_rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + unsigned int i; + + /* Transfer the Read chunk pages into @rqstp.rq_pages, replacing + * the rq_pages that were already allocated for this rqstp. + */ + release_pages(rqstp->rq_respages, ctxt->rc_page_count); + for (i = 0; i < ctxt->rc_page_count; i++) + rqstp->rq_pages[i] = ctxt->rc_pages[i]; + + /* Update @rqstp's result send buffer to start after the + * last page in the RDMA Read payload. + */ + rqstp->rq_respages = &rqstp->rq_pages[ctxt->rc_page_count]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Prevent svc_rdma_recv_ctxt_put() from releasing the + * pages in ctxt::rc_pages a second time. + */ + ctxt->rc_page_count = 0; + + /* Finish constructing the RPC Call message. The exact + * procedure for that depends on what kind of RPC/RDMA + * chunks were provided by the client. + */ + rqstp->rq_arg = ctxt->rc_saved_arg; + if (pcl_is_empty(&ctxt->rc_call_pcl)) { + if (ctxt->rc_read_pcl.cl_count == 1) + svc_rdma_read_complete_one(rqstp, ctxt); + else + svc_rdma_read_complete_multiple(rqstp, ctxt); + } else { + svc_rdma_read_complete_pzrc(rqstp, ctxt); + } + + trace_svcrdma_read_finished(&ctxt->rc_cid); +} + /** * svc_rdma_recvfrom - Receive an RPC call * @rqstp: request structure into which to receive an RPC Call @@ -798,8 +929,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; - ctxt = NULL; spin_lock(&rdma_xprt->sc_rq_dto_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); + if (ctxt) { + list_del(&ctxt->rc_list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + svc_xprt_received(xprt); + svc_rdma_read_complete(rqstp, ctxt); + goto complete; + } ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); if (ctxt) list_del(&ctxt->rc_list); @@ -831,12 +969,10 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_get_inv_rkey(rdma_xprt, ctxt); if (!pcl_is_empty(&ctxt->rc_read_pcl) || - !pcl_is_empty(&ctxt->rc_call_pcl)) { - ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); - if (ret < 0) - goto out_readfail; - } + !pcl_is_empty(&ctxt->rc_call_pcl)) + goto out_readlist; +complete: rqstp->rq_xprt_ctxt = ctxt; rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); @@ -848,12 +984,23 @@ out_err: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; -out_readfail: - if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, ctxt, ret); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - svc_xprt_deferred_close(xprt); - return -ENOTCONN; +out_readlist: + /* This @rqstp is about to be recycled. Save the work + * already done constructing the Call message in rq_arg + * so it can be restored when the RDMA Reads have + * completed. + */ + ctxt->rc_saved_arg = rqstp->rq_arg; + + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); + if (ret < 0) { + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, ctxt, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + svc_xprt_deferred_close(xprt); + return ret; + } + return 0; out_backchannel: svc_rdma_handle_bc_reply(rqstp, ctxt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index e460e25a1d..c00fcce61d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -39,6 +39,7 @@ struct svc_rdma_rw_ctxt { struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; + unsigned int rw_first_sgl_nents; struct sg_table rw_sg_table; struct scatterlist rw_first_sgl[]; }; @@ -53,6 +54,8 @@ svc_rdma_next_ctxt(struct list_head *list) static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { + struct ib_device *dev = rdma->sc_cm_id->device; + unsigned int first_sgl_nents = dev->attrs.max_send_sge; struct svc_rdma_rw_ctxt *ctxt; struct llist_node *node; @@ -62,32 +65,33 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), - GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); + ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), + GFP_KERNEL, ibdev_to_node(dev)); if (!ctxt) goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); + ctxt->rw_first_sgl_nents = first_sgl_nents; } ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ctxt->rw_sg_table.sgl, - SG_CHUNK_SIZE)) + first_sgl_nents)) goto out_free; return ctxt; out_free: kfree(ctxt); out_noctx: - trace_svcrdma_no_rwctx_err(rdma, sges); + trace_svcrdma_rwctx_empty(rdma, sges); return NULL; } static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { - sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); llist_add(&ctxt->rw_node, list); } @@ -135,57 +139,40 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, ctxt->rw_sg_table.sgl, ctxt->rw_nents, 0, offset, handle, direction); if (unlikely(ret < 0)) { + trace_svcrdma_dma_map_rw_err(rdma, offset, handle, + ctxt->rw_nents, ret); svc_rdma_put_rw_ctxt(rdma, ctxt); - trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); } return ret; } -/* A chunk context tracks all I/O for moving one Read or Write - * chunk. This is a set of rdma_rw's that handle data movement - * for all segments of one chunk. - * - * These are small, acquired with a single allocator call, and - * no more than one is needed per chunk. They are allocated on - * demand, and not cached. +/** + * svc_rdma_cc_init - Initialize an svc_rdma_chunk_ctxt + * @rdma: controlling transport instance + * @cc: svc_rdma_chunk_ctxt to be initialized */ -struct svc_rdma_chunk_ctxt { - struct rpc_rdma_cid cc_cid; - struct ib_cqe cc_cqe; - struct svcxprt_rdma *cc_rdma; - struct list_head cc_rwctxts; - ktime_t cc_posttime; - int cc_sqecount; - enum ib_wc_status cc_status; - struct completion cc_done; -}; - -static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) +void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) { - cid->ci_queue_id = rdma->sc_sq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} + struct rpc_rdma_cid *cid = &cc->cc_cid; -static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc) -{ - svc_rdma_cc_cid_init(rdma, &cc->cc_cid); - cc->cc_rdma = rdma; + if (unlikely(!cid->ci_completion_id)) + svc_rdma_send_cid_init(rdma, cid); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; } -/* - * The consumed rw_ctx's are cleaned and placed on a local llist so - * that only one atomic llist operation is needed to put them all - * back on the free list. +/** + * svc_rdma_cc_release - Release resources held by a svc_rdma_chunk_ctxt + * @rdma: controlling transport instance + * @cc: svc_rdma_chunk_ctxt to be released + * @dir: DMA direction */ -static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, - enum dma_data_direction dir) +void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) { - struct svcxprt_rdma *rdma = cc->cc_rdma; struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; LLIST_HEAD(free); @@ -215,6 +202,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, * - Stores arguments for the SGL constructor functions */ struct svc_rdma_write_info { + struct svcxprt_rdma *wi_rdma; + const struct svc_rdma_chunk *wi_chunk; /* write state of this chunk */ @@ -227,6 +216,7 @@ struct svc_rdma_write_info { unsigned int wi_next_off; struct svc_rdma_chunk_ctxt wi_cc; + struct work_struct wi_work; }; static struct svc_rdma_write_info * @@ -235,25 +225,33 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info; - info = kmalloc_node(sizeof(*info), GFP_KERNEL, + info = kzalloc_node(sizeof(*info), GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; + info->wi_rdma = rdma; info->wi_chunk = chunk; - info->wi_seg_off = 0; - info->wi_seg_no = 0; svc_rdma_cc_init(rdma, &info->wi_cc); info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; } -static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +static void svc_rdma_write_info_free_async(struct work_struct *work) { - svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); + struct svc_rdma_write_info *info; + + info = container_of(work, struct svc_rdma_write_info, wi_work); + svc_rdma_cc_release(info->wi_rdma, &info->wi_cc, DMA_TO_DEVICE); kfree(info); } +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + INIT_WORK(&info->wi_work, svc_rdma_write_info_free_async); + queue_work(svcrdma_wq, &info->wi_work); +} + /** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue @@ -263,16 +261,16 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) */ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) { + struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svcxprt_rdma *rdma = cc->cc_rdma; struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: - trace_svcrdma_wc_write(wc, &cc->cc_cid); + trace_svcrdma_wc_write(&cc->cc_cid); break; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); @@ -289,39 +287,6 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_write_info_free(info); } -/* State for pulling a Read chunk. - */ -struct svc_rdma_read_info { - struct svc_rqst *ri_rqst; - struct svc_rdma_recv_ctxt *ri_readctxt; - unsigned int ri_pageno; - unsigned int ri_pageoff; - unsigned int ri_totalbytes; - - struct svc_rdma_chunk_ctxt ri_cc; -}; - -static struct svc_rdma_read_info * -svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_read_info *info; - - info = kmalloc_node(sizeof(*info), GFP_KERNEL, - ibdev_to_node(rdma->sc_cm_id->device)); - if (!info) - return info; - - svc_rdma_cc_init(rdma, &info->ri_cc); - info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; - return info; -} - -static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) -{ - svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); - kfree(info); -} - /** * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx * @cq: controlling Completion Queue @@ -330,17 +295,27 @@ static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) */ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) { + struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_read_info *info; + struct svc_rdma_recv_ctxt *ctxt; + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + + ctxt = container_of(cc, struct svc_rdma_recv_ctxt, rc_cc); switch (wc->status) { case IB_WC_SUCCESS: - info = container_of(cc, struct svc_rdma_read_info, ri_cc); - trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + trace_svcrdma_wc_read(wc, &cc->cc_cid, ctxt->rc_readbytes, cc->cc_posttime); - break; + + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&ctxt->rc_list, &rdma->sc_read_complete_q); + /* the unlock pairs with the smp_rmb in svc_xprt_ready */ + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + spin_unlock(&rdma->sc_rq_dto_lock); + svc_xprt_enqueue(&rdma->sc_xprt); + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); break; @@ -348,10 +323,13 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); - cc->cc_status = wc->status; - complete(&cc->cc_done); - return; + /* The RDMA Read has flushed, so the incoming RPC message + * cannot be constructed and must be dropped. Signal the + * loss to the client by closing the connection. + */ + svc_rdma_cc_release(rdma, cc, DMA_FROM_DEVICE); + svc_rdma_recv_ctxt_put(rdma, ctxt); + svc_xprt_deferred_close(&rdma->sc_xprt); } /* @@ -360,9 +338,9 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) * even if one or more WRs are flushed. This is true when posting * an rdma_rw_ctx or when posting a single signaled WR. */ -static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) { - struct svcxprt_rdma *rdma = cc->cc_rdma; struct ib_send_wr *first_wr; const struct ib_send_wr *bad_wr; struct list_head *tmp; @@ -396,14 +374,14 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) } percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma); + trace_svcrdma_sq_full(rdma, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); - trace_svcrdma_sq_retry(rdma); + trace_svcrdma_sq_retry(rdma, &cc->cc_cid); } while (1); - trace_svcrdma_sq_post_err(rdma, ret); + trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); /* If even one was posted, there will be a completion. */ @@ -473,7 +451,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, unsigned int remaining) { struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; - struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svcxprt_rdma *rdma = info->wi_rdma; const struct svc_rdma_segment *seg; struct svc_rdma_rw_ctxt *ctxt; int ret; @@ -516,7 +494,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return 0; out_overflow: - trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, + trace_svcrdma_small_wrch_err(&cc->cc_cid, remaining, info->wi_seg_no, info->wi_chunk->ch_segcount); return -E2BIG; } @@ -633,7 +611,7 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, goto out_err; trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(cc); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; return xdr->len; @@ -680,7 +658,7 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, goto out_err; trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(cc); + ret = svc_rdma_post_chunk_ctxt(rdma, cc); if (ret < 0) goto out_err; @@ -693,7 +671,8 @@ out_err: /** * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment - * @info: context for ongoing I/O + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @segment: co-ordinates of remote memory to be read * * Returns: @@ -702,20 +681,20 @@ out_err: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, +static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_segment *segment) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; - struct svc_rqst *rqstp = info->ri_rqst; + struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; unsigned int sge_no, seg_len, len; struct svc_rdma_rw_ctxt *ctxt; struct scatterlist *sg; int ret; len = segment->rs_length; - sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; - ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); if (!ctxt) return -ENOMEM; ctxt->rw_nents = sge_no; @@ -723,29 +702,27 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, sg = ctxt->rw_sg_table.sgl; for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { seg_len = min_t(unsigned int, len, - PAGE_SIZE - info->ri_pageoff); + PAGE_SIZE - head->rc_pageoff); - if (!info->ri_pageoff) + if (!head->rc_pageoff) head->rc_page_count++; - sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], - seg_len, info->ri_pageoff); + sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], + seg_len, head->rc_pageoff); sg = sg_next(sg); - info->ri_pageoff += seg_len; - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; + head->rc_pageoff += seg_len; + if (head->rc_pageoff == PAGE_SIZE) { + head->rc_curpage++; + head->rc_pageoff = 0; } len -= seg_len; - /* Safety check */ - if (len && - &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages))) goto out_overrun; } - ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, segment->rs_handle, DMA_FROM_DEVICE); if (ret < 0) return -EIO; @@ -756,13 +733,14 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, return 0; out_overrun: - trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); + trace_svcrdma_page_overrun_err(&cc->cc_cid, head->rc_curpage); return -EINVAL; } /** * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk - * @info: context for ongoing I/O + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @chunk: Read chunk to pull * * Return values: @@ -771,7 +749,8 @@ out_overrun: * %-ENOMEM: allocating a local resources failed * %-EIO: a DMA mapping error occurred */ -static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_chunk *chunk) { const struct svc_rdma_segment *segment; @@ -779,56 +758,56 @@ static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info, ret = -EINVAL; pcl_for_each_segment(segment, chunk) { - ret = svc_rdma_build_read_segment(info, segment); + ret = svc_rdma_build_read_segment(rqstp, head, segment); if (ret < 0) break; - info->ri_totalbytes += segment->rs_length; + head->rc_readbytes += segment->rs_length; } return ret; } /** * svc_rdma_copy_inline_range - Copy part of the inline content into pages - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @offset: offset into the Receive buffer of region to copy * @remaining: length of region to copy * * Take a page at a time from rqstp->rq_pages and copy the inline * content from the Receive buffer into that page. Update - * info->ri_pageno and info->ri_pageoff so that the next RDMA Read + * head->rc_curpage and head->rc_pageoff so that the next RDMA Read * result will land contiguously with the copied content. * * Return values: * %0: Inline content was successfully copied * %-EINVAL: offset or length was incorrect */ -static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, +static int svc_rdma_copy_inline_range(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, unsigned int offset, unsigned int remaining) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; unsigned char *dst, *src = head->rc_recv_buf; - struct svc_rqst *rqstp = info->ri_rqst; unsigned int page_no, numpages; - numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT; + numpages = PAGE_ALIGN(head->rc_pageoff + remaining) >> PAGE_SHIFT; for (page_no = 0; page_no < numpages; page_no++) { unsigned int page_len; page_len = min_t(unsigned int, remaining, - PAGE_SIZE - info->ri_pageoff); + PAGE_SIZE - head->rc_pageoff); - if (!info->ri_pageoff) + if (!head->rc_pageoff) head->rc_page_count++; - dst = page_address(rqstp->rq_pages[info->ri_pageno]); - memcpy(dst + info->ri_pageno, src + offset, page_len); + dst = page_address(rqstp->rq_pages[head->rc_curpage]); + memcpy(dst + head->rc_curpage, src + offset, page_len); - info->ri_totalbytes += page_len; - info->ri_pageoff += page_len; - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; + head->rc_readbytes += page_len; + head->rc_pageoff += page_len; + if (head->rc_pageoff == PAGE_SIZE) { + head->rc_curpage++; + head->rc_pageoff = 0; } remaining -= page_len; offset += page_len; @@ -839,7 +818,8 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, /** * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The chunk data lands in rqstp->rq_arg as a series of contiguous pages, * like an incoming TCP call. @@ -851,11 +831,11 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info) +static noinline int +svc_rdma_read_multiple_chunks(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; struct svc_rdma_chunk *chunk, *next; unsigned int start, length; int ret; @@ -863,12 +843,12 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_copy_inline_range(info, start, length); + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) return ret; @@ -877,31 +857,21 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf break; start += length; - length = next->ch_position - info->ri_totalbytes; - ret = svc_rdma_copy_inline_range(info, start, length); + length = next->ch_position - head->rc_readbytes; + ret = svc_rdma_copy_inline_range(rqstp, head, start, length); if (ret < 0) return ret; } start += length; length = head->rc_byte_len - start; - ret = svc_rdma_copy_inline_range(info, start, length); - if (ret < 0) - return ret; - - buf->len += info->ri_totalbytes; - buf->buflen += info->ri_totalbytes; - - buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); - buf->pages = &info->ri_rqst->rq_pages[1]; - buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; - return 0; + return svc_rdma_copy_inline_range(rqstp, head, start, length); } /** * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The chunk data lands in the page list of rqstp->rq_arg.pages. * @@ -916,50 +886,17 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) +static int svc_rdma_read_data_item(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; - struct xdr_buf *buf = &info->ri_rqst->rq_arg; - struct svc_rdma_chunk *chunk; - unsigned int length; - int ret; - - chunk = pcl_first_chunk(&head->rc_read_pcl); - ret = svc_rdma_build_read_chunk(info, chunk); - if (ret < 0) - goto out; - - /* Split the Receive buffer between the head and tail - * buffers at Read chunk's position. XDR roundup of the - * chunk is not included in either the pagelist or in - * the tail. - */ - buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; - buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; - buf->head[0].iov_len = chunk->ch_position; - - /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). - * - * If the client already rounded up the chunk length, the - * length does not change. Otherwise, the length of the page - * list is increased to include XDR round-up. - * - * Currently these chunks always start at page offset 0, - * thus the rounded-up length never crosses a page boundary. - */ - buf->pages = &info->ri_rqst->rq_pages[0]; - length = xdr_align_size(chunk->ch_length); - buf->page_len = length; - buf->len += length; - buf->buflen += length; - -out: - return ret; + return svc_rdma_build_read_chunk(rqstp, head, + pcl_first_chunk(&head->rc_read_pcl)); } /** - * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk - * @info: context for RDMA Reads + * svc_rdma_read_chunk_range - Build RDMA Read WRs for portion of a chunk + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * @chunk: parsed Call chunk to pull * @offset: offset of region to pull * @length: length of region to pull @@ -971,7 +908,8 @@ out: * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, +static int svc_rdma_read_chunk_range(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, const struct svc_rdma_chunk *chunk, unsigned int offset, unsigned int length) { @@ -991,11 +929,11 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, dummy.rs_length = min_t(u32, length, segment->rs_length) - offset; dummy.rs_offset = segment->rs_offset + offset; - ret = svc_rdma_build_read_segment(info, &dummy); + ret = svc_rdma_build_read_segment(rqstp, head, &dummy); if (ret < 0) break; - info->ri_totalbytes += dummy.rs_length; + head->rc_readbytes += dummy.rs_length; length -= dummy.rs_length; offset = 0; } @@ -1004,7 +942,8 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, /** * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * Return values: * %0: RDMA Read WQEs were successfully built @@ -1013,9 +952,9 @@ static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info, * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) +static int svc_rdma_read_call_chunk(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_recv_ctxt *head = info->ri_readctxt; const struct svc_rdma_chunk *call_chunk = pcl_first_chunk(&head->rc_call_pcl); const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; @@ -1024,17 +963,18 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) int ret; if (pcl_is_empty(pcl)) - return svc_rdma_build_read_chunk(info, call_chunk); + return svc_rdma_build_read_chunk(rqstp, head, call_chunk); start = 0; chunk = pcl_first_chunk(pcl); length = chunk->ch_position; - ret = svc_rdma_read_chunk_range(info, call_chunk, start, length); + ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk, + start, length); if (ret < 0) return ret; pcl_for_each_chunk(chunk, pcl) { - ret = svc_rdma_build_read_chunk(info, chunk); + ret = svc_rdma_build_read_chunk(rqstp, head, chunk); if (ret < 0) return ret; @@ -1043,8 +983,8 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) break; start += length; - length = next->ch_position - info->ri_totalbytes; - ret = svc_rdma_read_chunk_range(info, call_chunk, + length = next->ch_position - head->rc_readbytes; + ret = svc_rdma_read_chunk_range(rqstp, head, call_chunk, start, length); if (ret < 0) return ret; @@ -1052,12 +992,14 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) start += length; length = call_chunk->ch_length - start; - return svc_rdma_read_chunk_range(info, call_chunk, start, length); + return svc_rdma_read_chunk_range(rqstp, head, call_chunk, + start, length); } /** * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message - * @info: context for RDMA Reads + * @rqstp: RPC transaction context + * @head: context for ongoing I/O * * The start of the data lands in the first page just after the * Transport header, and the rest lands in rqstp->rq_arg.pages. @@ -1073,25 +1015,31 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info) * %-ENOTCONN: posting failed (connection is lost), * %-EIO: rdma_rw initialization failed (DMA mapping, etc). */ -static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) +static noinline int svc_rdma_read_special(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) { - struct xdr_buf *buf = &info->ri_rqst->rq_arg; - int ret; - - ret = svc_rdma_read_call_chunk(info); - if (ret < 0) - goto out; - - buf->len += info->ri_totalbytes; - buf->buflen += info->ri_totalbytes; + return svc_rdma_read_call_chunk(rqstp, head); +} - buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); - buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); - buf->pages = &info->ri_rqst->rq_pages[1]; - buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; +/* Pages under I/O have been copied to head->rc_pages. Ensure that + * svc_xprt_release() does not put them when svc_rdma_recvfrom() + * returns. This has to be done after all Read WRs are constructed + * to properly handle a page that happens to be part of I/O on behalf + * of two different RDMA segments. + * + * Note: if the subsequent post_send fails, these pages have already + * been moved to head->rc_pages and thus will be cleaned up by + * svc_rdma_recv_ctxt_put(). + */ +static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) +{ + unsigned int i; -out: - return ret; + for (i = 0; i < head->rc_page_count; i++) { + head->rc_pages[i] = rqstp->rq_pages[i]; + rqstp->rq_pages[i] = NULL; + } } /** @@ -1121,49 +1069,27 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head) { - struct svc_rdma_read_info *info; - struct svc_rdma_chunk_ctxt *cc; + struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; int ret; - info = svc_rdma_read_info_alloc(rdma); - if (!info) - return -ENOMEM; - cc = &info->ri_cc; - info->ri_rqst = rqstp; - info->ri_readctxt = head; - info->ri_pageno = 0; - info->ri_pageoff = 0; - info->ri_totalbytes = 0; + cc->cc_cqe.done = svc_rdma_wc_read_done; + cc->cc_sqecount = 0; + head->rc_pageoff = 0; + head->rc_curpage = 0; + head->rc_readbytes = 0; if (pcl_is_empty(&head->rc_call_pcl)) { if (head->rc_read_pcl.cl_count == 1) - ret = svc_rdma_read_data_item(info); + ret = svc_rdma_read_data_item(rqstp, head); else - ret = svc_rdma_read_multiple_chunks(info); + ret = svc_rdma_read_multiple_chunks(rqstp, head); } else - ret = svc_rdma_read_special(info); + ret = svc_rdma_read_special(rqstp, head); + svc_rdma_clear_rqst_pages(rqstp, head); if (ret < 0) - goto out_err; + return ret; trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); - init_completion(&cc->cc_done); - ret = svc_rdma_post_chunk_ctxt(cc); - if (ret < 0) - goto out_err; - - ret = 1; - wait_for_completion(&cc->cc_done); - if (cc->cc_status != IB_WC_SUCCESS) - ret = -EIO; - - /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */ - head->rc_page_count = 0; - -out_err: - svc_rdma_read_info_free(info); - return ret; + ret = svc_rdma_post_chunk_ctxt(rdma, cc); + return ret < 0 ? ret : 1; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c6644cca52..1a49b7f020 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -113,13 +113,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, - struct rpc_rdma_cid *cid) -{ - cid->ci_queue_id = rdma->sc_sq_cq->res.id; - cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); -} - static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -129,7 +122,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) void *buffer; int i; - ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), + ctxt = kzalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), GFP_KERNEL, node); if (!ctxt) goto fail0; @@ -143,6 +136,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_rdma = rdma; ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; @@ -200,10 +194,11 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) spin_lock(&rdma->sc_send_lock); node = llist_del_first(&rdma->sc_send_ctxts); + spin_unlock(&rdma->sc_send_lock); if (!node) goto out_empty; + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); - spin_unlock(&rdma->sc_send_lock); out: rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); @@ -216,22 +211,14 @@ out: return ctxt; out_empty: - spin_unlock(&rdma->sc_send_lock); ctxt = svc_rdma_send_ctxt_alloc(rdma); if (!ctxt) return NULL; goto out; } -/** - * svc_rdma_send_ctxt_put - Return send_ctxt to free list - * @rdma: controlling svcxprt_rdma - * @ctxt: object to return to the free list - * - * Pages left in sc_pages are DMA unmapped and released. - */ -void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt) +static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; @@ -243,18 +230,40 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, * remains mapped until @ctxt is destroyed. */ for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) { + trace_svcrdma_dma_unmap_page(&ctxt->sc_cid, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length); ib_dma_unmap_page(device, ctxt->sc_sges[i].addr, ctxt->sc_sges[i].length, DMA_TO_DEVICE); - trace_svcrdma_dma_unmap_page(rdma, - ctxt->sc_sges[i].addr, - ctxt->sc_sges[i].length); } llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); } +static void svc_rdma_send_ctxt_put_async(struct work_struct *work) +{ + struct svc_rdma_send_ctxt *ctxt; + + ctxt = container_of(work, struct svc_rdma_send_ctxt, sc_work); + svc_rdma_send_ctxt_release(ctxt->sc_rdma, ctxt); +} + +/** + * svc_rdma_send_ctxt_put - Return send_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. + */ +void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + INIT_WORK(&ctxt->sc_work, svc_rdma_send_ctxt_put_async); + queue_work(svcrdma_wq, &ctxt->sc_work); +} + /** * svc_rdma_wake_send_waiters - manage Send Queue accounting * @rdma: controlling transport @@ -289,7 +298,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; - trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + trace_svcrdma_wc_send(&ctxt->sc_cid); svc_rdma_send_ctxt_put(rdma, ctxt); return; @@ -327,13 +336,13 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma); + trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); atomic_inc(&rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > 1); if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) return -ENOTCONN; - trace_svcrdma_sq_retry(rdma); + trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); continue; } @@ -344,7 +353,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) return 0; } - trace_svcrdma_sq_post_err(rdma, ret); + trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); svc_xprt_deferred_close(&rdma->sc_xprt); wake_up(&rdma->sc_send_wait); return ret; @@ -534,14 +543,14 @@ static int svc_rdma_page_dma_map(void *data, struct page *page, if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; - trace_svcrdma_dma_map_page(rdma, dma_addr, len); + trace_svcrdma_dma_map_page(&ctxt->sc_cid, dma_addr, len); ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; ctxt->sc_send_wr.num_sge++; return 0; out_maperr: - trace_svcrdma_dma_map_err(rdma, dma_addr, len); + trace_svcrdma_dma_map_err(&ctxt->sc_cid, dma_addr, len); return -EIO; } @@ -653,7 +662,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client * @xdr: xdr_buf containing RPC message to transmit * * Returns: @@ -662,7 +671,7 @@ static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr, */ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, const struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, const struct xdr_buf *xdr) { /* Resources needed for the transport header */ @@ -672,7 +681,7 @@ static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma, }; int ret; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_count_sges, &args); if (ret < 0) return false; @@ -728,7 +737,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client * @xdr: prepared xdr_buf containing RPC message * * The device is not capable of sending the reply directly. @@ -743,7 +752,7 @@ static int svc_rdma_xb_linearize(const struct xdr_buf *xdr, */ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, const struct xdr_buf *xdr) { struct svc_rdma_pullup_data args = { @@ -751,7 +760,7 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, }; int ret; - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_linearize, &args); if (ret < 0) return ret; @@ -764,7 +773,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, /* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message * @rdma: controlling transport * @sctxt: send_ctxt for the Send WR - * @rctxt: Write and Reply chunks provided by client + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client * @xdr: prepared xdr_buf containing RPC message * * Returns: @@ -776,7 +786,8 @@ static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma, */ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, const struct xdr_buf *xdr) { struct svc_rdma_map_data args = { @@ -789,18 +800,18 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; /* If there is a Reply chunk, nothing follows the transport - * header, and we're done here. + * header, so there is nothing to map. */ - if (!pcl_is_empty(&rctxt->rc_reply_pcl)) + if (!pcl_is_empty(reply_pcl)) return 0; /* For pull-up, svc_rdma_send() will sync the transport header. * No additional DMA mapping is necessary. */ - if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) - return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + if (svc_rdma_pull_up_needed(rdma, sctxt, write_pcl, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, write_pcl, xdr); - return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + return pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_dma_map, &args); } @@ -848,7 +859,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, { int ret; - ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res); + ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, &rqstp->rq_res); if (ret < 0) return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2abd895046..4f27325ace 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -125,6 +125,9 @@ static void qp_event_handler(struct ib_event *event, void *context) static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, struct net *net, int node) { + static struct lock_class_key svcrdma_rwctx_lock; + static struct lock_class_key svcrdma_sctx_lock; + static struct lock_class_key svcrdma_dto_lock; struct svcxprt_rdma *cma_xprt; cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node); @@ -134,6 +137,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); init_llist_head(&cma_xprt->sc_rw_ctxts); @@ -141,8 +145,11 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + lockdep_set_class(&cma_xprt->sc_rq_dto_lock, &svcrdma_dto_lock); spin_lock_init(&cma_xprt->sc_send_lock); + lockdep_set_class(&cma_xprt->sc_send_lock, &svcrdma_sctx_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); + lockdep_set_class(&cma_xprt->sc_rw_ctxt_lock, &svcrdma_rwctx_lock); /* * Note that this implies that the underlying transport support @@ -391,37 +398,35 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dev = newxprt->sc_cm_id->device; newxprt->sc_port_num = newxprt->sc_cm_id->port_num; - /* Qualify the transport resource defaults with the - * capabilities of this particular device */ + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; + newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); + + /* Qualify the transport's resource defaults with the + * capabilities of this particular device. + */ + /* Transport header, head iovec, tail iovec */ newxprt->sc_max_send_sges = 3; /* Add one SGE per page list entry */ newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1; if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; - newxprt->sc_max_req_size = svcrdma_max_req_size; - newxprt->sc_max_requests = svcrdma_max_requests; - newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + newxprt->sc_recv_batch; if (rq_depth > dev->attrs.max_qp_wr) { - pr_warn("svcrdma: reducing receive depth to %d\n", - dev->attrs.max_qp_wr); rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); ctxts *= newxprt->sc_max_requests; newxprt->sc_sq_depth = rq_depth + ctxts; - if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { - pr_warn("svcrdma: reducing send depth to %d\n", - dev->attrs.max_qp_wr); + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; - } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); newxprt->sc_pd = ib_alloc_pd(dev, 0); @@ -451,8 +456,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; qp_attr.recv_cq = newxprt->sc_rq_cq; - dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", - newxprt->sc_cm_id, newxprt->sc_pd); dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", @@ -506,7 +509,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - dprintk("svcrdma: new connection %p accepted:\n", newxprt); + dprintk("svcrdma: new connection accepted on device %s:\n", dev->name); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; @@ -547,6 +550,7 @@ static void __svc_rdma_free(struct work_struct *work) /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); + flush_workqueue(svcrdma_wq); svc_rdma_flush_recv_queues(rdma); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 28c0771c4e..4f8d7efa46 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1364,7 +1364,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) } rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; - trace_xprtrdma_post_recv(rep); + trace_xprtrdma_post_recv(&rep->rr_cid); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; --needed; diff --git a/net/tipc/link.c b/net/tipc/link.c index d014382365..0716eb5c8a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -82,10 +82,7 @@ struct tipc_stats { * struct tipc_link - TIPC link data structure * @addr: network address of link's peer node * @name: link name character string - * @media_addr: media address to use when sending messages over link - * @timer: link timer * @net: pointer to namespace struct - * @refcnt: reference counter for permanent references (owner node & timer) * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link @@ -94,31 +91,19 @@ struct tipc_stats { * @state: current state of link FSM * @peer_caps: bitmap describing capabilities of peer node * @silent_intv_cnt: # of timer intervals without any reception from peer - * @proto_msg: template for control messages generated by link - * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') * @mon_state: cookie with information needed by link monitor - * @backlog_limit: backlog queue congestion thresholds (indexed by importance) - * @exp_msg_count: # of tunnelled messages expected during link changeover - * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset * @mtu: current maximum packet size for this link * @advertised_mtu: advertised own mtu when link is being established - * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent - * @snt_nxt: next sequence number to use for outbound messages * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. * @rcv_nxt: next sequence number to expect for inbound messages - * @deferred_queue: deferred queue saved OOS b'cast message received from node - * @unacked_window: # of inbound messages rx'd without ack'ing back to peer * @inputq: buffer queue for messages to be delivered upwards * @namedq: buffer queue for name table messages to be delivered upwards - * @next_out: ptr to first unsent outbound message in queue * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate - * @long_msg_seq_no: next identifier to use for outbound fragmented messages * @reasm_buf: head of partially reassembled inbound message fragments - * @bc_rcvr: marks that this is a broadcast receiver link * @stats: collects statistics regarding link activity * @session: session to be used by link * @snd_nxt_state: next send seq number diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index c763008a8a..079aebb16e 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -168,7 +168,7 @@ static struct sk_buff *tipc_get_err_tlv(char *str) int str_len = strlen(str) + 1; struct sk_buff *buf; - buf = tipc_tlv_alloc(TLV_SPACE(str_len)); + buf = tipc_tlv_alloc(str_len); if (buf) tipc_add_tlv(buf, TIPC_TLV_ERROR_STRING, str, str_len); diff --git a/net/tls/tls.h b/net/tls/tls.h index 762f424ff2..e5e4745230 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -215,7 +215,7 @@ static inline struct sk_buff *tls_strp_msg(struct tls_sw_context_rx *ctx) static inline bool tls_strp_msg_ready(struct tls_sw_context_rx *ctx) { - return ctx->strp.msg_ready; + return READ_ONCE(ctx->strp.msg_ready); } static inline bool tls_strp_msg_mixed_decrypted(struct tls_sw_context_rx *ctx) diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index ca1e0e198c..5df08d848b 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -360,7 +360,7 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, if (strp->stm.full_len && strp->stm.full_len == skb->len) { desc->count = 0; - strp->msg_ready = 1; + WRITE_ONCE(strp->msg_ready, 1); tls_rx_msg_ready(strp); } @@ -528,7 +528,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp) if (!tls_strp_check_queue_ok(strp)) return tls_strp_read_copy(strp, false); - strp->msg_ready = 1; + WRITE_ONCE(strp->msg_ready, 1); tls_rx_msg_ready(strp); return 0; @@ -580,7 +580,7 @@ void tls_strp_msg_done(struct tls_strparser *strp) else tls_strp_flush_anchor_copy(strp); - strp->msg_ready = 0; + WRITE_ONCE(strp->msg_ready, 0); memset(&strp->stm, 0, sizeof(strp->stm)); tls_strp_check_rcv(strp); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 211f57164c..b783231668 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1976,10 +1976,10 @@ int tls_sw_recvmsg(struct sock *sk, if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); - psock = sk_psock_get(sk); err = tls_rx_reader_lock(sk, ctx, flags & MSG_DONTWAIT); if (err < 0) return err; + psock = sk_psock_get(sk); bpf_strp_enabled = sk_psock_strp_enabled(psock); /* If crypto failed the connection is broken */ @@ -2152,12 +2152,15 @@ recv_end: } /* Drain records from the rx_list & copy if required */ - if (is_peek || is_kvec) + if (is_peek) err = process_rx_list(ctx, msg, &control, copied + peeked, decrypted - peeked, is_peek, NULL); else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); + + /* we could have copied less than we wanted, and possibly nothing */ + decrypted += max(err, 0) - async_copy_bytes; } copied += decrypted; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0748e7ea52..9df15a7bc2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,11 +980,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; - u = unix_sk(sk); + u = unix_sk(sk); + u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ @@ -2602,9 +2602,13 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } - } else if (!(flags & MSG_PEEK)) { + } else if (flags & MSG_PEEK) { + skb = NULL; + } else { skb_unlink(skb, &sk->sk_receive_queue); - consume_skb(skb); + WRITE_ONCE(u->oob_skb, NULL); + if (!WARN_ON_ONCE(skb_unref(skb))) + kfree_skb(skb); skb = skb_peek(&sk->sk_receive_queue); } } @@ -2678,18 +2682,16 @@ redo: last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; +again: #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb) { skb = manage_oob(skb, sk, flags, copied); - if (!skb) { + if (!skb && copied) { unix_state_unlock(sk); - if (copied) - break; - goto redo; + break; } } #endif -again: if (skb == NULL) { if (copied >= target) goto unlock; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 027c86e804..2a758531e1 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -166,17 +166,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), static void dec_inflight(struct unix_sock *usk) { - atomic_long_dec(&usk->inflight); + usk->inflight--; } static void inc_inflight(struct unix_sock *usk) { - atomic_long_inc(&usk->inflight); + usk->inflight++; } static void inc_inflight_move_tail(struct unix_sock *u) { - atomic_long_inc(&u->inflight); + u->inflight++; + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over @@ -234,20 +235,34 @@ void unix_gc(void) * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. + * + * Embryos, though never candidates themselves, affect which + * candidates are reachable by the garbage collector. Before + * being added to a listener's queue, an embryo may already + * receive data carrying SCM_RIGHTS, potentially making the + * passed socket a candidate that is not yet reachable by the + * collector. It becomes reachable once the embryo is + * enqueued. Therefore, we must ensure that no SCM-laden + * embryo appears in a (candidate) listener's queue between + * consecutive scan_children() calls. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { + struct sock *sk = &u->sk; long total_refs; - long inflight_refs; - total_refs = file_count(u->sk.sk_socket->file); - inflight_refs = atomic_long_read(&u->inflight); + total_refs = file_count(sk->sk_socket->file); - BUG_ON(inflight_refs < 1); - BUG_ON(total_refs < inflight_refs); - if (total_refs == inflight_refs) { + BUG_ON(!u->inflight); + BUG_ON(total_refs < u->inflight); + if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); + + if (sk->sk_state == TCP_LISTEN) { + unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); + unix_state_unlock(sk); + } } } @@ -271,7 +286,7 @@ void unix_gc(void) /* Move cursor to after the current position. */ list_move(&cursor, &u->link); - if (atomic_long_read(&u->inflight) > 0) { + if (u->inflight) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); diff --git a/net/unix/scm.c b/net/unix/scm.c index 822ce0d0d7..e92f2fad64 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -53,12 +53,13 @@ void unix_inflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); - if (atomic_long_inc_return(&u->inflight) == 1) { + if (!u->inflight) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { BUG_ON(list_empty(&u->link)); } + u->inflight++; /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } @@ -75,10 +76,11 @@ void unix_notinflight(struct user_struct *user, struct file *fp) if (s) { struct unix_sock *u = unix_sk(s); - BUG_ON(!atomic_long_read(&u->inflight)); + BUG_ON(!u->inflight); BUG_ON(list_empty(&u->link)); - if (atomic_long_dec_and_test(&u->inflight)) + u->inflight--; + if (!u->inflight) list_del_init(&u->link); /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index f495b9e518..ee5d306a96 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (!skb) break; - virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), @@ -153,10 +152,10 @@ virtio_transport_send_pkt_work(struct work_struct *work) * 'virt_to_phys()' later to fill the buffer descriptor. * We don't touch memory at "virtual" address of this page. */ - va = page_to_virt(skb_frag->bv_page); + va = page_to_virt(skb_frag_page(skb_frag)); sg_init_one(sgs[out_sg], - va + skb_frag->bv_offset, - skb_frag->bv_len); + va + skb_frag_off(skb_frag), + skb_frag_size(skb_frag)); out_sg++; } } @@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) break; } + virtio_transport_deliver_tap_pkt(skb); + if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index a9ac85e09a..10345388ad 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -206,7 +206,6 @@ config CFG80211_KUNIT_TEST depends on KUNIT depends on CFG80211 default KUNIT_ALL_TESTS - depends on !KERNEL_6_2 help Enable this option to test cfg80211 functions with kunit. diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 089c841528..72074fd36d 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -25,7 +25,7 @@ ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) cfg80211-y += extra-certs.o endif -$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) +$(obj)/shipped-certs.c: $(sort $(wildcard $(srctree)/$(src)/certs/*.hex)) @$(kecho) " GEN $@" $(Q)(echo '#include "reg.h"'; \ echo 'const u8 shipped_regdb_certs[] = {'; \ @@ -35,7 +35,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) ) > $@ $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR) \ - $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR)/*.x509) + $(sort $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR)/*.x509)) @$(kecho) " GEN $@" $(Q)(set -e; \ allf=""; \ diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 2d21e423ab..ceb9174c5c 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -141,7 +141,7 @@ static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef) return true; } -static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width) +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width) { int mhz; @@ -190,6 +190,7 @@ static int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width) } return mhz; } +EXPORT_SYMBOL(nl80211_chan_width_to_mhz); static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) { @@ -514,9 +515,83 @@ static u32 cfg80211_get_end_freq(u32 center_freq, return end_freq; } +static bool +cfg80211_dfs_permissive_check_wdev(struct cfg80211_registered_device *rdev, + enum nl80211_iftype iftype, + struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + unsigned int link_id; + + for_each_valid_link(wdev, link_id) { + struct ieee80211_channel *other_chan = NULL; + struct cfg80211_chan_def chandef = {}; + int ret; + + /* In order to avoid daisy chaining only allow BSS STA */ + if (wdev->iftype != NL80211_IFTYPE_STATION || + !wdev->links[link_id].client.current_bss) + continue; + + other_chan = + wdev->links[link_id].client.current_bss->pub.channel; + + if (!other_chan) + continue; + + if (chan == other_chan) + return true; + + /* continue if we can't get the channel */ + ret = rdev_get_channel(rdev, wdev, link_id, &chandef); + if (ret) + continue; + + if (cfg80211_is_sub_chan(&chandef, chan, false)) + return true; + } + + return false; +} + +/* + * Check if P2P GO is allowed to operate on a DFS channel + */ +static bool cfg80211_dfs_permissive_chan(struct wiphy *wiphy, + enum nl80211_iftype iftype, + struct ieee80211_channel *chan) +{ + struct wireless_dev *wdev; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + lockdep_assert_held(&rdev->wiphy.mtx); + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_DFS_CONCURRENT) || + !(chan->flags & IEEE80211_CHAN_DFS_CONCURRENT)) + return false; + + /* only valid for P2P GO */ + if (iftype != NL80211_IFTYPE_P2P_GO) + return false; + + /* + * Allow only if there's a concurrent BSS + */ + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { + bool ret = cfg80211_dfs_permissive_check_wdev(rdev, iftype, + wdev, chan); + if (ret) + return ret; + } + + return false; +} + static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, u32 center_freq, - u32 bandwidth) + u32 bandwidth, + enum nl80211_iftype iftype) { struct ieee80211_channel *c; u32 freq, start_freq, end_freq; @@ -529,9 +604,11 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, if (!c) return -EINVAL; - if (c->flags & IEEE80211_CHAN_RADAR) + if (c->flags & IEEE80211_CHAN_RADAR && + !cfg80211_dfs_permissive_chan(wiphy, iftype, c)) return 1; } + return 0; } @@ -557,7 +634,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, ret = cfg80211_get_chans_dfs_required(wiphy, ieee80211_chandef_to_khz(chandef), - width); + width, iftype); if (ret < 0) return ret; else if (ret > 0) @@ -568,7 +645,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, ret = cfg80211_get_chans_dfs_required(wiphy, MHZ_TO_KHZ(chandef->center_freq2), - width); + width, iftype); if (ret < 0) return ret; else if (ret > 0) @@ -1336,15 +1413,19 @@ static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy, bool check_no_ir) { bool res; - u32 prohibited_flags = IEEE80211_CHAN_DISABLED | - IEEE80211_CHAN_RADAR; + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + int dfs_required; trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir); if (check_no_ir) prohibited_flags |= IEEE80211_CHAN_NO_IR; - if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 && + dfs_required = cfg80211_chandef_dfs_required(wiphy, chandef, iftype); + if (dfs_required != 0) + prohibited_flags |= IEEE80211_CHAN_RADAR; + + if (dfs_required > 0 && cfg80211_chandef_dfs_available(wiphy, chandef)) { /* We can skip IEEE80211_CHAN_NO_IR if chandef dfs available */ prohibited_flags = IEEE80211_CHAN_DISABLED; diff --git a/net/wireless/core.h b/net/wireless/core.h index cb61d33d4f..13657a85cf 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -458,6 +458,9 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev); extern struct work_struct cfg80211_disconnect_work; +#define NL80211_BSS_USE_FOR_ALL (NL80211_BSS_USE_FOR_NORMAL | \ + NL80211_BSS_USE_FOR_MLD_LINK) + void cfg80211_set_dfs_state(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state); @@ -546,4 +549,15 @@ int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask); +#if IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST) +#define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) +#define VISIBLE_IF_CFG80211_KUNIT +size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, + const u8 *subie, size_t subie_len, + u8 *new_ie, size_t new_ie_len); +#else +#define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym) +#define VISIBLE_IF_CFG80211_KUNIT static +#endif /* IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST) */ + #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index bad9e4fd84..f635a8b6ca 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -22,7 +22,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_rx_assoc_resp_data *data) + const struct cfg80211_rx_assoc_resp_data *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f853b54415..bd54a928ba 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -818,6 +818,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG }, [NL80211_ATTR_EMA_RNR_ELEMS] = { .type = NLA_NESTED }, [NL80211_ATTR_MLO_LINK_DISABLED] = { .type = NLA_FLAG }, + [NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA] = { .type = NLA_FLAG }, + [NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), + [NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), }; /* policy for the key attributes */ @@ -1198,6 +1201,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_EHT) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_EHT)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DFS_CONCURRENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DFS_CONCURRENT)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -4858,7 +4870,7 @@ static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy, return ERR_PTR(n_entries); if (n_entries > wiphy->max_acl_mac_addrs) - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); acl = kzalloc(struct_size(acl, mac_addrs, n_entries), GFP_KERNEL); if (!acl) @@ -9345,6 +9357,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) else eth_broadcast_addr(request->bssid); + request->tsf_report_link_id = nl80211_link_id_or_invalid(info->attrs); request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; @@ -10412,6 +10425,15 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, break; } + if (nla_put_u32(msg, NL80211_BSS_USE_FOR, res->use_for)) + goto nla_put_failure; + + if (res->cannot_use_reasons && + nla_put_u64_64bit(msg, NL80211_BSS_CANNOT_USE_REASONS, + res->cannot_use_reasons, + NL80211_BSS_PAD)) + goto nla_put_failure; + nla_nest_end(msg, bss); genlmsg_end(msg, hdr); @@ -10429,15 +10451,27 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) struct cfg80211_registered_device *rdev; struct cfg80211_internal_bss *scan; struct wireless_dev *wdev; + struct nlattr **attrbuf; int start = cb->args[2], idx = 0; + bool dump_include_use_data; int err; - err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); - if (err) + attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL); + if (!attrbuf) + return -ENOMEM; + + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, attrbuf); + if (err) { + kfree(attrbuf); return err; + } /* nl80211_prepare_wdev_dump acquired it in the successful case */ __acquire(&rdev->wiphy.mtx); + dump_include_use_data = + attrbuf[NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA]; + kfree(attrbuf); + spin_lock_bh(&rdev->bss_lock); /* @@ -10454,6 +10488,9 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) list_for_each_entry(scan, &rdev->bss_list, list) { if (++idx <= start) continue; + if (!dump_include_use_data && + !(scan->pub.use_for & NL80211_BSS_USE_FOR_NORMAL)) + continue; if (nl80211_send_bss(skb, cb, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev, scan) < 0) { @@ -10905,12 +10942,13 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device *rdev, const u8 *ssid, int ssid_len, - struct nlattr **attrs) + struct nlattr **attrs, + int assoc_link_id, int link_id) { struct ieee80211_channel *chan; struct cfg80211_bss *bss; const u8 *bssid; - u32 freq; + u32 freq, use_for = 0; if (!attrs[NL80211_ATTR_MAC] || !attrs[NL80211_ATTR_WIPHY_FREQ]) return ERR_PTR(-EINVAL); @@ -10925,10 +10963,16 @@ static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device if (!chan) return ERR_PTR(-EINVAL); - bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, - ssid, ssid_len, - IEEE80211_BSS_TYPE_ESS, - IEEE80211_PRIVACY_ANY); + if (assoc_link_id >= 0) + use_for = NL80211_BSS_USE_FOR_MLD_LINK; + if (assoc_link_id == link_id) + use_for |= NL80211_BSS_USE_FOR_NORMAL; + + bss = __cfg80211_get_bss(&rdev->wiphy, chan, bssid, + ssid, ssid_len, + IEEE80211_BSS_TYPE_ESS, + IEEE80211_PRIVACY_ANY, + use_for); if (!bss) return ERR_PTR(-ENOENT); @@ -11107,7 +11151,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } req.links[link_id].bss = - nl80211_assoc_bss(rdev, ssid, ssid_len, attrs); + nl80211_assoc_bss(rdev, ssid, ssid_len, attrs, + req.link_id, link_id); if (IS_ERR(req.links[link_id].bss)) { err = PTR_ERR(req.links[link_id].bss); req.links[link_id].bss = NULL; @@ -11172,7 +11217,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (req.link_id >= 0) return -EINVAL; - req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs); + req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs, + -1, -1); if (IS_ERR(req.bss)) return PTR_ERR(req.bss); ap_addr = req.bss->bssid; @@ -12181,16 +12227,18 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) return err; } -static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) +static int nl80211_set_pmksa(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_pmksa *pmksa) = NULL; struct net_device *dev = info->user_ptr[1]; struct cfg80211_pmksa pmksa; + bool ap_pmksa_caching_support = false; memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING); + if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; @@ -12199,16 +12247,15 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) { pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); } else if (info->attrs[NL80211_ATTR_SSID] && - info->attrs[NL80211_ATTR_FILS_CACHE_ID] && - (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA || - info->attrs[NL80211_ATTR_PMK])) { + info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + info->attrs[NL80211_ATTR_PMK]) { pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - pmksa.cache_id = - nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); } else { return -EINVAL; } + if (info->attrs[NL80211_ATTR_PMK]) { pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); @@ -12220,32 +12267,71 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]) pmksa.pmk_reauth_threshold = - nla_get_u8( - info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); + nla_get_u8(info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && - !(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP && - wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_AP_PMKSA_CACHING))) + !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) && + ap_pmksa_caching_support)) return -EOPNOTSUPP; - switch (info->genlhdr->cmd) { - case NL80211_CMD_SET_PMKSA: - rdev_ops = rdev->ops->set_pmksa; - break; - case NL80211_CMD_DEL_PMKSA: - rdev_ops = rdev->ops->del_pmksa; - break; - default: - WARN_ON(1); - break; + if (!rdev->ops->set_pmksa) + return -EOPNOTSUPP; + + return rdev_set_pmksa(rdev, dev, &pmksa); +} + +static int nl80211_del_pmksa(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_pmksa pmksa; + bool sae_offload_support = false; + bool owe_offload_support = false; + bool ap_pmksa_caching_support = false; + + memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + + sae_offload_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD); + owe_offload_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_OWE_OFFLOAD); + ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING); + + if (info->attrs[NL80211_ATTR_PMKID]) + pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); + + if (info->attrs[NL80211_ATTR_MAC]) { + pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + } else if (info->attrs[NL80211_ATTR_SSID]) { + /* SSID based pmksa flush suppported only for FILS, + * OWE/SAE OFFLOAD cases + */ + if (info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + info->attrs[NL80211_ATTR_PMK]) { + pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + } else if (!sae_offload_support && !owe_offload_support) { + return -EINVAL; + } + pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + } else { + return -EINVAL; } - if (!rdev_ops) + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) && + ap_pmksa_caching_support)) + return -EOPNOTSUPP; + + if (!rdev->ops->del_pmksa) return -EOPNOTSUPP; - return rdev_ops(&rdev->wiphy, dev, &pmksa); + return rdev_del_pmksa(rdev, dev, &pmksa); } static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) @@ -15849,7 +15935,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (tid_conf->mask & ~mask) { NL_SET_ERR_MSG(extack, "unsupported TID configuration"); - return -ENOTSUPP; + return -EOPNOTSUPP; } return 0; @@ -16242,6 +16328,35 @@ static int nl80211_set_hw_timestamp(struct sk_buff *skb, return rdev_set_hw_timestamp(rdev, dev, &hwts); } +static int +nl80211_set_ttlm(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_ttlm_params params = {}; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + if (!wdev->connected) + return -ENOLINK; + + if (!info->attrs[NL80211_ATTR_MLO_TTLM_DLINK] || + !info->attrs[NL80211_ATTR_MLO_TTLM_ULINK]) + return -EINVAL; + + nla_memcpy(params.dlink, + info->attrs[NL80211_ATTR_MLO_TTLM_DLINK], + sizeof(params.dlink)); + nla_memcpy(params.ulink, + info->attrs[NL80211_ATTR_MLO_TTLM_ULINK], + sizeof(params.ulink)); + + return rdev_set_ttlm(rdev, dev, ¶ms); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -16930,7 +17045,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { { .cmd = NL80211_CMD_SET_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_setdel_pmksa, + .doit = nl80211_set_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), @@ -16938,7 +17053,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { { .cmd = NL80211_CMD_DEL_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_setdel_pmksa, + .doit = nl80211_del_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, @@ -17423,6 +17538,12 @@ static const struct genl_small_ops nl80211_small_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, + { + .cmd = NL80211_CMD_SET_TID_TO_LINK_MAPPING, + .doit = nl80211_set_ttlm, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -17754,21 +17875,29 @@ nla_put_failure: nlmsg_free(msg); } +struct nl80211_mlme_event { + enum nl80211_commands cmd; + const u8 *buf; + size_t buf_len; + int uapsd_queues; + const u8 *req_ies; + size_t req_ies_len; + bool reconnect; +}; + static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, struct net_device *netdev, - const u8 *buf, size_t len, - enum nl80211_commands cmd, gfp_t gfp, - int uapsd_queues, const u8 *req_ies, - size_t req_ies_len, bool reconnect) + const struct nl80211_mlme_event *event, + gfp_t gfp) { struct sk_buff *msg; void *hdr; - msg = nlmsg_new(100 + len + req_ies_len, gfp); + msg = nlmsg_new(100 + event->buf_len + event->req_ies_len, gfp); if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + hdr = nl80211hdr_put(msg, 0, 0, 0, event->cmd); if (!hdr) { nlmsg_free(msg); return; @@ -17776,22 +17905,24 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - nla_put(msg, NL80211_ATTR_FRAME, len, buf) || - (req_ies && - nla_put(msg, NL80211_ATTR_REQ_IE, req_ies_len, req_ies))) + nla_put(msg, NL80211_ATTR_FRAME, event->buf_len, event->buf) || + (event->req_ies && + nla_put(msg, NL80211_ATTR_REQ_IE, event->req_ies_len, + event->req_ies))) goto nla_put_failure; - if (reconnect && nla_put_flag(msg, NL80211_ATTR_RECONNECT_REQUESTED)) + if (event->reconnect && + nla_put_flag(msg, NL80211_ATTR_RECONNECT_REQUESTED)) goto nla_put_failure; - if (uapsd_queues >= 0) { + if (event->uapsd_queues >= 0) { struct nlattr *nla_wmm = nla_nest_start_noflag(msg, NL80211_ATTR_STA_WME); if (!nla_wmm) goto nla_put_failure; if (nla_put_u8(msg, NL80211_STA_WME_UAPSD_QUEUES, - uapsd_queues)) + event->uapsd_queues)) goto nla_put_failure; nla_nest_end(msg, nla_wmm); @@ -17811,37 +17942,60 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp) { - nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_AUTHENTICATE, gfp, -1, NULL, 0, - false); + struct nl80211_mlme_event event = { + .cmd = NL80211_CMD_AUTHENTICATE, + .buf = buf, + .buf_len = len, + .uapsd_queues = -1, + }; + + nl80211_send_mlme_event(rdev, netdev, &event, gfp); } void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - struct cfg80211_rx_assoc_resp_data *data) + const struct cfg80211_rx_assoc_resp_data *data) { - nl80211_send_mlme_event(rdev, netdev, data->buf, data->len, - NL80211_CMD_ASSOCIATE, GFP_KERNEL, - data->uapsd_queues, - data->req_ies, data->req_ies_len, false); + struct nl80211_mlme_event event = { + .cmd = NL80211_CMD_ASSOCIATE, + .buf = data->buf, + .buf_len = data->len, + .uapsd_queues = data->uapsd_queues, + .req_ies = data->req_ies, + .req_ies_len = data->req_ies_len, + }; + + nl80211_send_mlme_event(rdev, netdev, &event, GFP_KERNEL); } void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, bool reconnect, gfp_t gfp) { - nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DEAUTHENTICATE, gfp, -1, NULL, 0, - reconnect); + struct nl80211_mlme_event event = { + .cmd = NL80211_CMD_DEAUTHENTICATE, + .buf = buf, + .buf_len = len, + .reconnect = reconnect, + .uapsd_queues = -1, + }; + + nl80211_send_mlme_event(rdev, netdev, &event, gfp); } void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, bool reconnect, gfp_t gfp) { - nl80211_send_mlme_event(rdev, netdev, buf, len, - NL80211_CMD_DISASSOCIATE, gfp, -1, NULL, 0, - reconnect); + struct nl80211_mlme_event event = { + .cmd = NL80211_CMD_DISASSOCIATE, + .buf = buf, + .buf_len = len, + .reconnect = reconnect, + .uapsd_queues = -1, + }; + + nl80211_send_mlme_event(rdev, netdev, &event, gfp); } void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, @@ -17851,28 +18005,31 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const struct ieee80211_mgmt *mgmt = (void *)buf; - u32 cmd; + struct nl80211_mlme_event event = { + .buf = buf, + .buf_len = len, + .uapsd_queues = -1, + }; if (WARN_ON(len < 2)) return; if (ieee80211_is_deauth(mgmt->frame_control)) { - cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; + event.cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; } else if (ieee80211_is_disassoc(mgmt->frame_control)) { - cmd = NL80211_CMD_UNPROT_DISASSOCIATE; + event.cmd = NL80211_CMD_UNPROT_DISASSOCIATE; } else if (ieee80211_is_beacon(mgmt->frame_control)) { if (wdev->unprot_beacon_reported && elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000) return; - cmd = NL80211_CMD_UNPROT_BEACON; + event.cmd = NL80211_CMD_UNPROT_BEACON; wdev->unprot_beacon_reported = jiffies; } else { return; } trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); - nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1, - NULL, 0, false); + nl80211_send_mlme_event(rdev, dev, &event, GFP_ATOMIC); } EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt); @@ -19326,6 +19483,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, break; } + cfg80211_schedule_channels_check(wdev); cfg80211_sched_dfs_chan_update(rdev); nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL, @@ -20083,6 +20241,20 @@ nla_put_failure: } EXPORT_SYMBOL(cfg80211_update_owe_info_event); +void cfg80211_schedule_channels_check(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + + /* Schedule channels check if NO_IR or DFS relaxations are supported */ + if (wdev->iftype == NL80211_IFTYPE_STATION && + (wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_DFS_CONCURRENT) || + (IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) && + wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))) + reg_check_channels(); +} +EXPORT_SYMBOL(cfg80211_schedule_channels_check); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index aad40240d9..6376f3a87f 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -60,7 +60,7 @@ void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, - struct cfg80211_rx_assoc_resp_data *data); + const struct cfg80211_rx_assoc_resp_data *data); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 2214a90cf1..43897a5269 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1046,7 +1046,7 @@ rdev_nan_change_conf(struct cfg80211_registered_device *rdev, ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, changes); else - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -1200,7 +1200,7 @@ rdev_start_radar_detection(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef, u32 cac_time_ms) { - int ret = -ENOTSUPP; + int ret = -EOPNOTSUPP; trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms); @@ -1226,7 +1226,7 @@ rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, int mcast_rate[NUM_NL80211_BANDS]) { - int ret = -ENOTSUPP; + int ret = -EOPNOTSUPP; trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); if (rdev->ops->set_mcast_rate) @@ -1239,7 +1239,7 @@ static inline int rdev_set_coalesce(struct cfg80211_registered_device *rdev, struct cfg80211_coalesce *coalesce) { - int ret = -ENOTSUPP; + int ret = -EOPNOTSUPP; trace_rdev_set_coalesce(&rdev->wiphy, coalesce); if (rdev->ops->set_coalesce) @@ -1524,4 +1524,22 @@ rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev, return ret; } + +static inline int +rdev_set_ttlm(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ttlm_params *params) +{ + struct wiphy *wiphy = &rdev->wiphy; + int ret; + + if (!rdev->ops->set_ttlm) + return -EOPNOTSUPP; + + trace_rdev_set_ttlm(wiphy, dev, params); + ret = rdev->ops->set_ttlm(wiphy, dev, params); + trace_rdev_return_int(wiphy, ret); + + return ret; +} #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2ef4f6cc7a..2741b62691 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1593,6 +1593,12 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_320MHZ; if (rd_flags & NL80211_RRF_NO_EHT) channel_flags |= IEEE80211_CHAN_NO_EHT; + if (rd_flags & NL80211_RRF_DFS_CONCURRENT) + channel_flags |= IEEE80211_CHAN_DFS_CONCURRENT; + if (rd_flags & NL80211_RRF_NO_UHB_VLP_CLIENT) + channel_flags |= IEEE80211_CHAN_NO_UHB_VLP_CLIENT; + if (rd_flags & NL80211_RRF_NO_UHB_AFC_CLIENT) + channel_flags |= IEEE80211_CHAN_NO_UHB_AFC_CLIENT; if (rd_flags & NL80211_RRF_PSD) channel_flags |= IEEE80211_CHAN_PSD; return channel_flags; @@ -2478,7 +2484,7 @@ static void reg_check_chans_work(struct work_struct *work) rtnl_unlock(); } -static void reg_check_channels(void) +void reg_check_channels(void) { /* * Give usermode a chance to do something nicer (move to another diff --git a/net/wireless/reg.h b/net/wireless/reg.h index a703e53c23..a02ef5609f 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -181,6 +181,11 @@ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); */ int reg_reload_regdb(void); +/** + * reg_check_channels - schedule regulatory enforcement + */ +void reg_check_channels(void); + extern const u8 shipped_regdb_certs[]; extern unsigned int shipped_regdb_certs_len; extern const u8 extra_regdb_certs[]; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 3f49f5c699..f138f88be9 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -20,6 +20,7 @@ #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include <net/iw_handler.h> +#include <kunit/visibility.h> #include "core.h" #include "nl80211.h" #include "wext-compat.h" @@ -303,9 +304,10 @@ static size_t cfg80211_copy_elem_with_frags(const struct element *elem, return *pos - buf; } -static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, - const u8 *subie, size_t subie_len, - u8 *new_ie, size_t new_ie_len) +VISIBLE_IF_CFG80211_KUNIT size_t +cfg80211_gen_new_ie(const u8 *ie, size_t ielen, + const u8 *subie, size_t subie_len, + u8 *new_ie, size_t new_ie_len) { const struct element *non_inherit_elem, *parent, *sub; u8 *pos = new_ie; @@ -413,6 +415,7 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, return pos - new_ie; } +EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_gen_new_ie); static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, const u8 *ssid, size_t ssid_len) @@ -1535,12 +1538,13 @@ static bool cfg80211_bss_type_match(u16 capability, } /* Returned bss is reference counted and must be cleaned up appropriately. */ -struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *ssid, size_t ssid_len, - enum ieee80211_bss_type bss_type, - enum ieee80211_privacy privacy) +struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy, + u32 use_for) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *res = NULL; @@ -1565,6 +1569,8 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, continue; if (!is_valid_ether_addr(bss->pub.bssid)) continue; + if ((bss->pub.use_for & use_for) != use_for) + continue; /* Don't get expired BSS structs */ if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) && !atomic_read(&bss->hold)) @@ -1582,7 +1588,7 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, trace_cfg80211_return_bss(&res->pub); return &res->pub; } -EXPORT_SYMBOL(cfg80211_get_bss); +EXPORT_SYMBOL(__cfg80211_get_bss); static void rb_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) @@ -1859,6 +1865,8 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, ether_addr_copy(known->parent_bssid, new->parent_bssid); known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; known->pub.bssid_index = new->pub.bssid_index; + known->pub.use_for &= new->pub.use_for; + known->pub.cannot_use_reasons = new->pub.cannot_use_reasons; return true; } @@ -1870,15 +1878,15 @@ __cfg80211_bss_update(struct cfg80211_registered_device *rdev, bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; + struct cfg80211_bss_ies *ies; if (WARN_ON(!tmp->pub.channel)) - return NULL; + goto free_ies; tmp->ts = ts; - if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) { - return NULL; - } + if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) + goto free_ies; found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR); @@ -1888,7 +1896,6 @@ __cfg80211_bss_update(struct cfg80211_registered_device *rdev, } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; - struct cfg80211_bss_ies *ies; /* * create a copy -- the "res" variable that is passed in @@ -1897,15 +1904,8 @@ __cfg80211_bss_update(struct cfg80211_registered_device *rdev, */ new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size, GFP_ATOMIC); - if (!new) { - ies = (void *)rcu_dereference(tmp->pub.beacon_ies); - if (ies) - kfree_rcu(ies, rcu_head); - ies = (void *)rcu_dereference(tmp->pub.proberesp_ies); - if (ies) - kfree_rcu(ies, rcu_head); - return NULL; - } + if (!new) + goto free_ies; memcpy(new, tmp, sizeof(*new)); new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); @@ -1965,6 +1965,16 @@ __cfg80211_bss_update(struct cfg80211_registered_device *rdev, bss_ref_get(rdev, found); return found; + +free_ies: + ies = (void *)rcu_dereference(tmp->pub.beacon_ies); + if (ies) + kfree_rcu(ies, rcu_head); + ies = (void *)rcu_dereference(tmp->pub.proberesp_ies); + if (ies) + kfree_rcu(ies, rcu_head); + + return NULL; } struct cfg80211_internal_bss * @@ -2107,6 +2117,9 @@ struct cfg80211_inform_single_bss_data { struct cfg80211_bss *source_bss; u8 max_bssid_indicator; u8 bssid_index; + + u8 use_for; + u64 cannot_use_reasons; }; /* Returned bss is reference counted and must be cleaned up appropriately. */ @@ -2152,6 +2165,8 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.ts_boottime = drv_data->boottime_ns; tmp.parent_tsf = drv_data->parent_tsf; ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid); + tmp.pub.use_for = data->use_for; + tmp.pub.cannot_use_reasons = data->cannot_use_reasons; if (data->bss_source != BSS_SOURCE_DIRECT) { tmp.pub.transmitted_bss = data->source_bss; @@ -2322,6 +2337,8 @@ cfg80211_parse_mbssid_data(struct wiphy *wiphy, .beacon_interval = tx_data->beacon_interval, .source_bss = source_bss, .bss_source = BSS_SOURCE_MBSSID, + .use_for = tx_data->use_for, + .cannot_use_reasons = tx_data->cannot_use_reasons, }; const u8 *mbssid_index_ie; const struct element *elem, *sub; @@ -2584,10 +2601,10 @@ error: return NULL; } -static bool -cfg80211_tbtt_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, - const struct ieee80211_neighbor_ap_info **ap_info, - const u8 **tbtt_info) +static u8 +cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, + const struct ieee80211_neighbor_ap_info **ap_info, + u8 *param_ch_count) { const struct ieee80211_neighbor_ap_info *info; const struct element *rnr; @@ -2603,6 +2620,7 @@ cfg80211_tbtt_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, u16 params; u8 length, i, count, mld_params_offset; u8 type, lid; + u32 use_for; info = (void *)pos; count = u8_get_bits(info->tbtt_info_hdr, @@ -2612,20 +2630,22 @@ cfg80211_tbtt_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, pos += sizeof(*info); if (count * length > end - pos) - return false; + return 0; type = u8_get_bits(info->tbtt_info_hdr, IEEE80211_AP_INFO_TBTT_HDR_TYPE); - /* Only accept full TBTT information. NSTR mobile APs - * use the shortened version, but we ignore them here. - */ if (type == IEEE80211_TBTT_INFO_TYPE_TBTT && length >= offsetofend(struct ieee80211_tbtt_info_ge_11, mld_params)) { mld_params_offset = offsetof(struct ieee80211_tbtt_info_ge_11, mld_params); + use_for = NL80211_BSS_USE_FOR_ALL; + } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && + length >= sizeof(struct ieee80211_rnr_mld_params)) { + mld_params_offset = 0; + use_for = NL80211_BSS_USE_FOR_MLD_LINK; } else { pos += count * length; continue; @@ -2641,9 +2661,11 @@ cfg80211_tbtt_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, if (mld_id == mld_params->mld_id && link_id == lid) { *ap_info = info; - *tbtt_info = pos; + *param_ch_count = + le16_get_bits(mld_params->params, + IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); - return true; + return use_for; } pos += length; @@ -2651,7 +2673,104 @@ cfg80211_tbtt_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, } } - return false; + return 0; +} + +static struct element * +cfg80211_gen_reporter_rnr(struct cfg80211_bss *source_bss, bool is_mbssid, + bool same_mld, u8 link_id, u8 bss_change_count, + gfp_t gfp) +{ + const struct cfg80211_bss_ies *ies; + struct ieee80211_neighbor_ap_info ap_info; + struct ieee80211_tbtt_info_ge_11 tbtt_info; + u32 short_ssid; + const struct element *elem; + struct element *res; + + /* + * We only generate the RNR to permit ML lookups. For that we do not + * need an entry for the corresponding transmitting BSS, lets just skip + * it even though it would be easy to add. + */ + if (!same_mld) + return NULL; + + /* We could use tx_data->ies if we change cfg80211_calc_short_ssid */ + rcu_read_lock(); + ies = rcu_dereference(source_bss->ies); + + ap_info.tbtt_info_len = offsetofend(typeof(tbtt_info), mld_params); + ap_info.tbtt_info_hdr = + u8_encode_bits(IEEE80211_TBTT_INFO_TYPE_TBTT, + IEEE80211_AP_INFO_TBTT_HDR_TYPE) | + u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT); + + ap_info.channel = ieee80211_frequency_to_channel(source_bss->channel->center_freq); + + /* operating class */ + elem = cfg80211_find_elem(WLAN_EID_SUPPORTED_REGULATORY_CLASSES, + ies->data, ies->len); + if (elem && elem->datalen >= 1) { + ap_info.op_class = elem->data[0]; + } else { + struct cfg80211_chan_def chandef; + + /* The AP is not providing us with anything to work with. So + * make up a somewhat reasonable operating class, but don't + * bother with it too much as no one will ever use the + * information. + */ + cfg80211_chandef_create(&chandef, source_bss->channel, + NL80211_CHAN_NO_HT); + + if (!ieee80211_chandef_to_operating_class(&chandef, + &ap_info.op_class)) + goto out_unlock; + } + + /* Just set TBTT offset and PSD 20 to invalid/unknown */ + tbtt_info.tbtt_offset = 255; + tbtt_info.psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED; + + memcpy(tbtt_info.bssid, source_bss->bssid, ETH_ALEN); + if (cfg80211_calc_short_ssid(ies, &elem, &short_ssid)) + goto out_unlock; + + rcu_read_unlock(); + + tbtt_info.short_ssid = cpu_to_le32(short_ssid); + + tbtt_info.bss_params = IEEE80211_RNR_TBTT_PARAMS_SAME_SSID; + + if (is_mbssid) { + tbtt_info.bss_params |= IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; + tbtt_info.bss_params |= IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID; + } + + tbtt_info.mld_params.mld_id = 0; + tbtt_info.mld_params.params = + le16_encode_bits(link_id, IEEE80211_RNR_MLD_PARAMS_LINK_ID) | + le16_encode_bits(bss_change_count, + IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); + + res = kzalloc(struct_size(res, data, + sizeof(ap_info) + ap_info.tbtt_info_len), + gfp); + if (!res) + return NULL; + + /* Copy the data */ + res->id = WLAN_EID_REDUCED_NEIGHBOR_REPORT; + res->datalen = sizeof(ap_info) + ap_info.tbtt_info_len; + memcpy(res->data, &ap_info, sizeof(ap_info)); + memcpy(res->data + sizeof(ap_info), &tbtt_info, ap_info.tbtt_info_len); + + return res; + +out_unlock: + rcu_read_unlock(); + return NULL; } static void @@ -2667,12 +2786,14 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, .source_bss = source_bss, .bss_source = BSS_SOURCE_STA_PROFILE, }; + struct element *reporter_rnr = NULL; struct ieee80211_multi_link_elem *ml_elem; struct cfg80211_mle *mle; u16 control; - u8 *new_ie; + u8 ml_common_len; + u8 *new_ie = NULL; struct cfg80211_bss *bss; - int mld_id; + u8 mld_id, reporter_link_id, bss_change_count; u16 seen_links = 0; const u8 *pos; u8 i; @@ -2692,8 +2813,16 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, !(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) return; - /* length + MLD MAC address + link ID info + BSS Params Change Count */ - pos = ml_elem->variable + 1 + 6 + 1 + 1; + ml_common_len = ml_elem->variable[0]; + + /* length + MLD MAC address */ + pos = ml_elem->variable + 1 + 6; + + reporter_link_id = pos[0]; + pos += 1; + + bss_change_count = pos[0]; + pos += 1; if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)) pos += 2; @@ -2724,18 +2853,29 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, if (!mle) return; + /* No point in doing anything if there is no per-STA profile */ + if (!mle->sta_prof[0]) + goto out; + new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); if (!new_ie) goto out; + reporter_rnr = cfg80211_gen_reporter_rnr(source_bss, + u16_get_bits(control, + IEEE80211_MLC_BASIC_PRES_MLD_ID), + mld_id == 0, reporter_link_id, + bss_change_count, + gfp); + for (i = 0; i < ARRAY_SIZE(mle->sta_prof) && mle->sta_prof[i]; i++) { const struct ieee80211_neighbor_ap_info *ap_info; enum nl80211_band band; u32 freq; const u8 *profile; - const u8 *tbtt_info; ssize_t profile_len; - u8 link_id; + u8 param_ch_count; + u8 link_id, use_for; if (!ieee80211_mle_basic_sta_prof_size_ok((u8 *)mle->sta_prof[i], mle->sta_prof_len[i])) @@ -2777,9 +2917,12 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, profile_len -= 2; /* Find in RNR to look up channel information */ - if (!cfg80211_tbtt_info_for_mld_ap(tx_data->ie, tx_data->ielen, - mld_id, link_id, - &ap_info, &tbtt_info)) + use_for = cfg80211_rnr_info_for_mld_ap(tx_data->ie, + tx_data->ielen, + mld_id, link_id, + &ap_info, + ¶m_ch_count); + if (!use_for) continue; /* We could sanity check the BSSID is included */ @@ -2791,6 +2934,14 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, freq = ieee80211_channel_to_freq_khz(ap_info->channel, band); data.channel = ieee80211_get_channel_khz(wiphy, freq); + if (use_for == NL80211_BSS_USE_FOR_MLD_LINK && + !(wiphy->flags & WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY)) { + use_for = 0; + data.cannot_use_reasons = + NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY; + } + data.use_for = use_for; + /* Generate new elements */ memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); data.ie = new_ie; @@ -2801,6 +2952,46 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, if (!data.ielen) continue; + /* The generated elements do not contain: + * - Basic ML element + * - A TBTT entry in the RNR for the transmitting AP + * + * This information is needed both internally and in userspace + * as such, we should append it here. + */ + if (data.ielen + 3 + sizeof(*ml_elem) + ml_common_len > + IEEE80211_MAX_DATA_LEN) + continue; + + /* Copy the Basic Multi-Link element including the common + * information, and then fix up the link ID and BSS param + * change count. + * Note that the ML element length has been verified and we + * also checked that it contains the link ID. + */ + new_ie[data.ielen++] = WLAN_EID_EXTENSION; + new_ie[data.ielen++] = 1 + sizeof(*ml_elem) + ml_common_len; + new_ie[data.ielen++] = WLAN_EID_EXT_EHT_MULTI_LINK; + memcpy(new_ie + data.ielen, ml_elem, + sizeof(*ml_elem) + ml_common_len); + + new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN] = link_id; + new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN + 1] = + param_ch_count; + + data.ielen += sizeof(*ml_elem) + ml_common_len; + + if (reporter_rnr && (use_for & NL80211_BSS_USE_FOR_NORMAL)) { + if (data.ielen + sizeof(struct element) + + reporter_rnr->datalen > IEEE80211_MAX_DATA_LEN) + continue; + + memcpy(new_ie + data.ielen, reporter_rnr, + sizeof(struct element) + reporter_rnr->datalen); + data.ielen += sizeof(struct element) + + reporter_rnr->datalen; + } + bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp); if (!bss) break; @@ -2808,6 +2999,7 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, } out: + kfree(reporter_rnr); kfree(new_ie); kfree(mle); } @@ -2847,6 +3039,10 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, .beacon_interval = beacon_interval, .ie = ie, .ielen = ielen, + .use_for = data->restrict_use ? + data->use_for : + NL80211_BSS_USE_FOR_ALL, + .cannot_use_reasons = data->cannot_use_reasons, }; struct cfg80211_bss *res; @@ -2864,6 +3060,36 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_inform_bss_data); +static bool cfg80211_uhb_power_type_valid(const u8 *ie, + size_t ielen, + const u32 flags) +{ + const struct element *tmp; + struct ieee80211_he_operation *he_oper; + + tmp = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen); + if (tmp && tmp->datalen >= sizeof(*he_oper) + 1) { + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + + he_oper = (void *)&tmp->data[1]; + he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); + + if (!he_6ghz_oper) + return false; + + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + return true; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + return !(flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT); + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + return !(flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT); + } + } + return false; +} + /* cfg80211_inform_bss_width_frame helper */ static struct cfg80211_bss * cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, @@ -2922,6 +3148,14 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, if (!channel) return NULL; + if (channel->band == NL80211_BAND_6GHZ && + !cfg80211_uhb_power_type_valid(variable, ielen, channel->flags)) { + data->restrict_use = 1; + data->use_for = 0; + data->cannot_use_reasons = + NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH; + } + if (ext) { const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; @@ -2977,6 +3211,10 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, tmp.pub.chains = data->chains; memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(tmp.parent_bssid, data->parent_bssid); + tmp.pub.use_for = data->restrict_use ? + data->use_for : + NL80211_BSS_USE_FOR_ALL; + tmp.pub.cannot_use_reasons = data->cannot_use_reasons; signal_valid = data->chan == channel; spin_lock_bh(&rdev->bss_lock); @@ -3008,6 +3246,10 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, .ie = mgmt->u.probe_resp.variable, .ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable), + .use_for = data->restrict_use ? + data->use_for : + NL80211_BSS_USE_FOR_ALL, + .cannot_use_reasons = data->cannot_use_reasons, }; struct cfg80211_bss *res; @@ -3158,10 +3400,9 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, if (new) { /* to save time, update IEs for transmitting bss only */ - if (cfg80211_update_known_bss(rdev, cbss, new, false)) { - new->pub.proberesp_ies = NULL; - new->pub.beacon_ies = NULL; - } + cfg80211_update_known_bss(rdev, cbss, new, false); + new->pub.proberesp_ies = NULL; + new->pub.beacon_ies = NULL; list_for_each_entry_safe(nontrans_bss, tmp, &new->pub.nontrans_list, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index acfe66da71..195c853273 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1394,6 +1394,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, #endif schedule_work(&cfg80211_disconnect_work); + + cfg80211_schedule_channels_check(wdev); } void cfg80211_disconnected(struct net_device *dev, u16 reason, diff --git a/net/wireless/tests/Makefile b/net/wireless/tests/Makefile index fa8e297bbc..1f6622fcb7 100644 --- a/net/wireless/tests/Makefile +++ b/net/wireless/tests/Makefile @@ -1,3 +1,3 @@ -cfg80211-tests-y += module.o fragmentation.o +cfg80211-tests-y += module.o fragmentation.o scan.o util.o obj-$(CONFIG_CFG80211_KUNIT_TEST) += cfg80211-tests.o diff --git a/net/wireless/tests/scan.c b/net/wireless/tests/scan.c new file mode 100644 index 0000000000..f9ea44aee9 --- /dev/null +++ b/net/wireless/tests/scan.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for inform_bss functions + * + * Copyright (C) 2023-2024 Intel Corporation + */ +#include <linux/ieee80211.h> +#include <net/cfg80211.h> +#include <kunit/test.h> +#include <kunit/skbuff.h> +#include "../core.h" +#include "util.h" + +/* mac80211 helpers for element building */ +#include "../../mac80211/ieee80211_i.h" + +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); + +struct test_elem { + u8 id; + u8 len; + union { + u8 data[255]; + struct { + u8 eid; + u8 edata[254]; + }; + }; +}; + +static struct gen_new_ie_case { + const char *desc; + struct test_elem parent_ies[16]; + struct test_elem child_ies[16]; + struct test_elem result_ies[16]; +} gen_new_ie_cases[] = { + { + .desc = "ML not inherited", + .parent_ies = { + { .id = WLAN_EID_EXTENSION, .len = 255, + .eid = WLAN_EID_EXT_EHT_MULTI_LINK }, + }, + .child_ies = { + { .id = WLAN_EID_SSID, .len = 2 }, + }, + .result_ies = { + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, + { + .desc = "fragments are ignored if previous len not 255", + .parent_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 254, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + }, + .child_ies = { + { .id = WLAN_EID_SSID, .len = 2 }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + }, + .result_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 254, }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, + { + .desc = "fragments inherited", + .parent_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + }, + .child_ies = { + { .id = WLAN_EID_SSID, .len = 2 }, + }, + .result_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, + { + .desc = "fragments copied", + .parent_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + }, + .child_ies = { + { .id = WLAN_EID_SSID, .len = 2 }, + }, + .result_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, + { + .desc = "multiple elements inherit", + .parent_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 123, }, + }, + .child_ies = { + { .id = WLAN_EID_SSID, .len = 2 }, + }, + .result_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 123, }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, + { + .desc = "one child element overrides", + .parent_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 255, }, + { .id = WLAN_EID_FRAGMENT, .len = 125, }, + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 123, }, + }, + .child_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 127, }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + .result_ies = { + { .id = WLAN_EID_REDUCED_NEIGHBOR_REPORT, .len = 127, }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, + { + .desc = "empty elements from parent", + .parent_ies = { + { .id = 0x1, .len = 0, }, + { .id = WLAN_EID_EXTENSION, .len = 1, .eid = 0x10 }, + }, + .child_ies = { + }, + .result_ies = { + { .id = 0x1, .len = 0, }, + { .id = WLAN_EID_EXTENSION, .len = 1, .eid = 0x10 }, + }, + }, + { + .desc = "empty elements from child", + .parent_ies = { + }, + .child_ies = { + { .id = 0x1, .len = 0, }, + { .id = WLAN_EID_EXTENSION, .len = 1, .eid = 0x10 }, + }, + .result_ies = { + { .id = 0x1, .len = 0, }, + { .id = WLAN_EID_EXTENSION, .len = 1, .eid = 0x10 }, + }, + }, + { + .desc = "invalid extended elements ignored", + .parent_ies = { + { .id = WLAN_EID_EXTENSION, .len = 0 }, + }, + .child_ies = { + { .id = WLAN_EID_EXTENSION, .len = 0 }, + }, + .result_ies = { + }, + }, + { + .desc = "multiple extended elements", + .parent_ies = { + { .id = WLAN_EID_EXTENSION, .len = 3, + .eid = WLAN_EID_EXT_HE_CAPABILITY }, + { .id = WLAN_EID_EXTENSION, .len = 5, + .eid = WLAN_EID_EXT_ASSOC_DELAY_INFO }, + { .id = WLAN_EID_EXTENSION, .len = 7, + .eid = WLAN_EID_EXT_HE_OPERATION }, + { .id = WLAN_EID_EXTENSION, .len = 11, + .eid = WLAN_EID_EXT_FILS_REQ_PARAMS }, + }, + .child_ies = { + { .id = WLAN_EID_SSID, .len = 13 }, + { .id = WLAN_EID_EXTENSION, .len = 17, + .eid = WLAN_EID_EXT_HE_CAPABILITY }, + { .id = WLAN_EID_EXTENSION, .len = 11, + .eid = WLAN_EID_EXT_FILS_KEY_CONFIRM }, + { .id = WLAN_EID_EXTENSION, .len = 19, + .eid = WLAN_EID_EXT_HE_OPERATION }, + }, + .result_ies = { + { .id = WLAN_EID_EXTENSION, .len = 17, + .eid = WLAN_EID_EXT_HE_CAPABILITY }, + { .id = WLAN_EID_EXTENSION, .len = 5, + .eid = WLAN_EID_EXT_ASSOC_DELAY_INFO }, + { .id = WLAN_EID_EXTENSION, .len = 19, + .eid = WLAN_EID_EXT_HE_OPERATION }, + { .id = WLAN_EID_EXTENSION, .len = 11, + .eid = WLAN_EID_EXT_FILS_REQ_PARAMS }, + { .id = WLAN_EID_SSID, .len = 13 }, + { .id = WLAN_EID_EXTENSION, .len = 11, + .eid = WLAN_EID_EXT_FILS_KEY_CONFIRM }, + }, + }, + { + .desc = "non-inherit element", + .parent_ies = { + { .id = 0x1, .len = 7, }, + { .id = 0x2, .len = 11, }, + { .id = 0x3, .len = 13, }, + { .id = WLAN_EID_EXTENSION, .len = 17, .eid = 0x10 }, + { .id = WLAN_EID_EXTENSION, .len = 19, .eid = 0x11 }, + { .id = WLAN_EID_EXTENSION, .len = 23, .eid = 0x12 }, + { .id = WLAN_EID_EXTENSION, .len = 29, .eid = 0x14 }, + }, + .child_ies = { + { .id = WLAN_EID_EXTENSION, + .eid = WLAN_EID_EXT_NON_INHERITANCE, + .len = 10, + .edata = { 0x3, 0x1, 0x2, 0x3, + 0x4, 0x10, 0x11, 0x13, 0x14 } }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + .result_ies = { + { .id = WLAN_EID_EXTENSION, .len = 23, .eid = 0x12 }, + { .id = WLAN_EID_SSID, .len = 2 }, + }, + }, +}; +KUNIT_ARRAY_PARAM_DESC(gen_new_ie, gen_new_ie_cases, desc) + +static void test_gen_new_ie(struct kunit *test) +{ + const struct gen_new_ie_case *params = test->param_value; + struct sk_buff *parent = kunit_zalloc_skb(test, 1024, GFP_KERNEL); + struct sk_buff *child = kunit_zalloc_skb(test, 1024, GFP_KERNEL); + struct sk_buff *reference = kunit_zalloc_skb(test, 1024, GFP_KERNEL); + u8 *out = kunit_kzalloc(test, IEEE80211_MAX_DATA_LEN, GFP_KERNEL); + size_t len; + int i; + + KUNIT_ASSERT_NOT_NULL(test, parent); + KUNIT_ASSERT_NOT_NULL(test, child); + KUNIT_ASSERT_NOT_NULL(test, reference); + KUNIT_ASSERT_NOT_NULL(test, out); + + for (i = 0; i < ARRAY_SIZE(params->parent_ies); i++) { + if (params->parent_ies[i].len != 0) { + skb_put_u8(parent, params->parent_ies[i].id); + skb_put_u8(parent, params->parent_ies[i].len); + skb_put_data(parent, params->parent_ies[i].data, + params->parent_ies[i].len); + } + + if (params->child_ies[i].len != 0) { + skb_put_u8(child, params->child_ies[i].id); + skb_put_u8(child, params->child_ies[i].len); + skb_put_data(child, params->child_ies[i].data, + params->child_ies[i].len); + } + + if (params->result_ies[i].len != 0) { + skb_put_u8(reference, params->result_ies[i].id); + skb_put_u8(reference, params->result_ies[i].len); + skb_put_data(reference, params->result_ies[i].data, + params->result_ies[i].len); + } + } + + len = cfg80211_gen_new_ie(parent->data, parent->len, + child->data, child->len, + out, IEEE80211_MAX_DATA_LEN); + KUNIT_EXPECT_EQ(test, len, reference->len); + KUNIT_EXPECT_MEMEQ(test, out, reference->data, reference->len); + memset(out, 0, IEEE80211_MAX_DATA_LEN); + + /* Exactly enough space */ + len = cfg80211_gen_new_ie(parent->data, parent->len, + child->data, child->len, + out, reference->len); + KUNIT_EXPECT_EQ(test, len, reference->len); + KUNIT_EXPECT_MEMEQ(test, out, reference->data, reference->len); + memset(out, 0, IEEE80211_MAX_DATA_LEN); + + /* Not enough space (or expected zero length) */ + len = cfg80211_gen_new_ie(parent->data, parent->len, + child->data, child->len, + out, reference->len - 1); + KUNIT_EXPECT_EQ(test, len, 0); +} + +static void test_gen_new_ie_malformed(struct kunit *test) +{ + struct sk_buff *malformed = kunit_zalloc_skb(test, 1024, GFP_KERNEL); + u8 *out = kunit_kzalloc(test, IEEE80211_MAX_DATA_LEN, GFP_KERNEL); + size_t len; + + KUNIT_ASSERT_NOT_NULL(test, malformed); + KUNIT_ASSERT_NOT_NULL(test, out); + + skb_put_u8(malformed, WLAN_EID_SSID); + skb_put_u8(malformed, 3); + skb_put(malformed, 3); + skb_put_u8(malformed, WLAN_EID_REDUCED_NEIGHBOR_REPORT); + skb_put_u8(malformed, 10); + skb_put(malformed, 9); + + len = cfg80211_gen_new_ie(malformed->data, malformed->len, + out, 0, + out, IEEE80211_MAX_DATA_LEN); + KUNIT_EXPECT_EQ(test, len, 5); + + len = cfg80211_gen_new_ie(out, 0, + malformed->data, malformed->len, + out, IEEE80211_MAX_DATA_LEN); + KUNIT_EXPECT_EQ(test, len, 5); +} + +struct inform_bss { + struct kunit *test; + + int inform_bss_count; +}; + +static void inform_bss_inc_counter(struct wiphy *wiphy, + struct cfg80211_bss *bss, + const struct cfg80211_bss_ies *ies, + void *drv_data) +{ + struct inform_bss *ctx = t_wiphy_ctx(wiphy); + + ctx->inform_bss_count++; + + rcu_read_lock(); + KUNIT_EXPECT_PTR_EQ(ctx->test, drv_data, ctx); + KUNIT_EXPECT_PTR_EQ(ctx->test, ies, rcu_dereference(bss->ies)); + rcu_read_unlock(); +} + +static void test_inform_bss_ssid_only(struct kunit *test) +{ + struct inform_bss ctx = { + .test = test, + }; + struct wiphy *wiphy = T_WIPHY(test, ctx); + struct t_wiphy_priv *w_priv = wiphy_priv(wiphy); + struct cfg80211_inform_bss inform_bss = { + .signal = 50, + .drv_data = &ctx, + }; + const u8 bssid[ETH_ALEN] = { 0x10, 0x22, 0x33, 0x44, 0x55, 0x66 }; + u64 tsf = 0x1000000000000000ULL; + int beacon_int = 100; + u16 capability = 0x1234; + static const u8 input[] = { + [0] = WLAN_EID_SSID, + [1] = 4, + [2] = 'T', 'E', 'S', 'T' + }; + struct cfg80211_bss *bss, *other; + const struct cfg80211_bss_ies *ies; + + w_priv->ops->inform_bss = inform_bss_inc_counter; + + inform_bss.chan = ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(2412)); + KUNIT_ASSERT_NOT_NULL(test, inform_bss.chan); + + bss = cfg80211_inform_bss_data(wiphy, &inform_bss, + CFG80211_BSS_FTYPE_PRESP, bssid, tsf, + capability, beacon_int, + input, sizeof(input), + GFP_KERNEL); + KUNIT_EXPECT_NOT_NULL(test, bss); + KUNIT_EXPECT_EQ(test, ctx.inform_bss_count, 1); + + /* Check values in returned bss are correct */ + KUNIT_EXPECT_EQ(test, bss->signal, inform_bss.signal); + KUNIT_EXPECT_EQ(test, bss->beacon_interval, beacon_int); + KUNIT_EXPECT_EQ(test, bss->capability, capability); + KUNIT_EXPECT_EQ(test, bss->bssid_index, 0); + KUNIT_EXPECT_PTR_EQ(test, bss->channel, inform_bss.chan); + KUNIT_EXPECT_MEMEQ(test, bssid, bss->bssid, sizeof(bssid)); + + /* Check the IEs have the expected value */ + rcu_read_lock(); + ies = rcu_dereference(bss->ies); + KUNIT_EXPECT_NOT_NULL(test, ies); + KUNIT_EXPECT_EQ(test, ies->tsf, tsf); + KUNIT_EXPECT_EQ(test, ies->len, sizeof(input)); + KUNIT_EXPECT_MEMEQ(test, ies->data, input, sizeof(input)); + rcu_read_unlock(); + + /* Check we can look up the BSS - by SSID */ + other = cfg80211_get_bss(wiphy, NULL, NULL, "TEST", 4, + IEEE80211_BSS_TYPE_ANY, + IEEE80211_PRIVACY_ANY); + KUNIT_EXPECT_PTR_EQ(test, bss, other); + cfg80211_put_bss(wiphy, other); + + /* Check we can look up the BSS - by BSSID */ + other = cfg80211_get_bss(wiphy, NULL, bssid, NULL, 0, + IEEE80211_BSS_TYPE_ANY, + IEEE80211_PRIVACY_ANY); + KUNIT_EXPECT_PTR_EQ(test, bss, other); + cfg80211_put_bss(wiphy, other); + + cfg80211_put_bss(wiphy, bss); +} + +static struct inform_bss_ml_sta_case { + const char *desc; + int mld_id; + bool sta_prof_vendor_elems; + bool include_oper_class; +} inform_bss_ml_sta_cases[] = { + { + .desc = "zero_mld_id", + .mld_id = 0, + .sta_prof_vendor_elems = false, + }, { + .desc = "zero_mld_id_with_oper_class", + .mld_id = 0, + .sta_prof_vendor_elems = false, + .include_oper_class = true, + }, { + .desc = "mld_id_eq_1", + .mld_id = 1, + .sta_prof_vendor_elems = true, + }, { + .desc = "mld_id_eq_1_with_oper_class", + .mld_id = 1, + .sta_prof_vendor_elems = true, + .include_oper_class = true, + }, +}; +KUNIT_ARRAY_PARAM_DESC(inform_bss_ml_sta, inform_bss_ml_sta_cases, desc) + +static void test_inform_bss_ml_sta(struct kunit *test) +{ + const struct inform_bss_ml_sta_case *params = test->param_value; + struct inform_bss ctx = { + .test = test, + }; + struct wiphy *wiphy = T_WIPHY(test, ctx); + struct t_wiphy_priv *w_priv = wiphy_priv(wiphy); + struct cfg80211_inform_bss inform_bss = { + .signal = 50, + .drv_data = &ctx, + }; + struct cfg80211_bss *bss, *link_bss; + const struct cfg80211_bss_ies *ies; + + /* sending station */ + const u8 bssid[ETH_ALEN] = { 0x10, 0x22, 0x33, 0x44, 0x55, 0x66 }; + u64 tsf = 0x1000000000000000ULL; + int beacon_int = 100; + u16 capability = 0x1234; + + /* Building the frame *************************************************/ + struct sk_buff *input = kunit_zalloc_skb(test, 1024, GFP_KERNEL); + u8 *len_mle, *len_prof; + u8 link_id = 2; + struct { + struct ieee80211_neighbor_ap_info info; + struct ieee80211_tbtt_info_ge_11 ap; + } __packed rnr = { + .info = { + .tbtt_info_hdr = u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT), + .tbtt_info_len = sizeof(struct ieee80211_tbtt_info_ge_11), + .op_class = 81, + .channel = 11, + }, + .ap = { + .tbtt_offset = 0xff, + .bssid = { 0x10, 0x22, 0x33, 0x44, 0x55, 0x67 }, + .short_ssid = 0, /* unused */ + .bss_params = 0, + .psd_20 = 0, + .mld_params.mld_id = params->mld_id, + .mld_params.params = + le16_encode_bits(link_id, + IEEE80211_RNR_MLD_PARAMS_LINK_ID), + } + }; + struct { + __le16 control; + u8 var_len; + u8 mld_mac_addr[ETH_ALEN]; + u8 link_id_info; + u8 params_change_count; + __le16 mld_caps_and_ops; + u8 mld_id; + __le16 ext_mld_caps_and_ops; + } __packed mle_basic_common_info = { + .control = + cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC | + IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT | + IEEE80211_MLC_BASIC_PRES_LINK_ID | + (params->mld_id ? IEEE80211_MLC_BASIC_PRES_MLD_ID : 0) | + IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP), + .mld_id = params->mld_id, + .mld_caps_and_ops = cpu_to_le16(0x0102), + .ext_mld_caps_and_ops = cpu_to_le16(0x0304), + .var_len = sizeof(mle_basic_common_info) - 2 - + (params->mld_id ? 0 : 1), + .mld_mac_addr = { 0x10, 0x22, 0x33, 0x44, 0x55, 0x60 }, + }; + struct { + __le16 control; + u8 var_len; + u8 bssid[ETH_ALEN]; + __le16 beacon_int; + __le64 tsf_offset; + __le16 capabilities; /* already part of payload */ + } __packed sta_prof = { + .control = + cpu_to_le16(IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE | + IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT | + IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT | + IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT | + u16_encode_bits(link_id, + IEEE80211_MLE_STA_CONTROL_LINK_ID)), + .var_len = sizeof(sta_prof) - 2 - 2, + .bssid = { *rnr.ap.bssid }, + .beacon_int = cpu_to_le16(101), + .tsf_offset = cpu_to_le64(-123ll), + .capabilities = cpu_to_le16(0xdead), + }; + + KUNIT_ASSERT_NOT_NULL(test, input); + + w_priv->ops->inform_bss = inform_bss_inc_counter; + + inform_bss.chan = ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(2412)); + KUNIT_ASSERT_NOT_NULL(test, inform_bss.chan); + + skb_put_u8(input, WLAN_EID_SSID); + skb_put_u8(input, 4); + skb_put_data(input, "TEST", 4); + + if (params->include_oper_class) { + skb_put_u8(input, WLAN_EID_SUPPORTED_REGULATORY_CLASSES); + skb_put_u8(input, 1); + skb_put_u8(input, 81); + } + + skb_put_u8(input, WLAN_EID_REDUCED_NEIGHBOR_REPORT); + skb_put_u8(input, sizeof(rnr)); + skb_put_data(input, &rnr, sizeof(rnr)); + + /* build a multi-link element */ + skb_put_u8(input, WLAN_EID_EXTENSION); + len_mle = skb_put(input, 1); + skb_put_u8(input, WLAN_EID_EXT_EHT_MULTI_LINK); + skb_put_data(input, &mle_basic_common_info, sizeof(mle_basic_common_info)); + if (!params->mld_id) + t_skb_remove_member(input, typeof(mle_basic_common_info), mld_id); + /* with a STA profile inside */ + skb_put_u8(input, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); + len_prof = skb_put(input, 1); + skb_put_data(input, &sta_prof, sizeof(sta_prof)); + + if (params->sta_prof_vendor_elems) { + /* Put two (vendor) element into sta_prof */ + skb_put_u8(input, WLAN_EID_VENDOR_SPECIFIC); + skb_put_u8(input, 160); + skb_put(input, 160); + + skb_put_u8(input, WLAN_EID_VENDOR_SPECIFIC); + skb_put_u8(input, 165); + skb_put(input, 165); + } + + /* fragment STA profile */ + ieee80211_fragment_element(input, len_prof, + IEEE80211_MLE_SUBELEM_FRAGMENT); + /* fragment MLE */ + ieee80211_fragment_element(input, len_mle, WLAN_EID_FRAGMENT); + + /* Put a (vendor) element after the ML element */ + skb_put_u8(input, WLAN_EID_VENDOR_SPECIFIC); + skb_put_u8(input, 155); + skb_put(input, 155); + + /* Submit *************************************************************/ + bss = cfg80211_inform_bss_data(wiphy, &inform_bss, + CFG80211_BSS_FTYPE_PRESP, bssid, tsf, + capability, beacon_int, + input->data, input->len, + GFP_KERNEL); + KUNIT_EXPECT_NOT_NULL(test, bss); + KUNIT_EXPECT_EQ(test, ctx.inform_bss_count, 2); + + /* Check link_bss *****************************************************/ + link_bss = cfg80211_get_bss(wiphy, NULL, sta_prof.bssid, NULL, 0, + IEEE80211_BSS_TYPE_ANY, + IEEE80211_PRIVACY_ANY); + KUNIT_ASSERT_NOT_NULL(test, link_bss); + KUNIT_EXPECT_EQ(test, link_bss->signal, 0); + KUNIT_EXPECT_EQ(test, link_bss->beacon_interval, + le16_to_cpu(sta_prof.beacon_int)); + KUNIT_EXPECT_EQ(test, link_bss->capability, + le16_to_cpu(sta_prof.capabilities)); + KUNIT_EXPECT_EQ(test, link_bss->bssid_index, 0); + KUNIT_EXPECT_PTR_EQ(test, link_bss->channel, + ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(2462))); + + rcu_read_lock(); + ies = rcu_dereference(link_bss->ies); + KUNIT_EXPECT_NOT_NULL(test, ies); + KUNIT_EXPECT_EQ(test, ies->tsf, tsf + le64_to_cpu(sta_prof.tsf_offset)); + /* Resulting length should be: + * SSID (inherited) + RNR (inherited) + vendor element(s) + + * operating class (if requested) + + * generated RNR (if MLD ID == 0) + + * MLE common info + MLE header and control + */ + if (params->sta_prof_vendor_elems) + KUNIT_EXPECT_EQ(test, ies->len, + 6 + 2 + sizeof(rnr) + 2 + 160 + 2 + 165 + + (params->include_oper_class ? 3 : 0) + + (!params->mld_id ? 22 : 0) + + mle_basic_common_info.var_len + 5); + else + KUNIT_EXPECT_EQ(test, ies->len, + 6 + 2 + sizeof(rnr) + 2 + 155 + + (params->include_oper_class ? 3 : 0) + + (!params->mld_id ? 22 : 0) + + mle_basic_common_info.var_len + 5); + rcu_read_unlock(); + + cfg80211_put_bss(wiphy, bss); + cfg80211_put_bss(wiphy, link_bss); +} + +static struct kunit_case gen_new_ie_test_cases[] = { + KUNIT_CASE_PARAM(test_gen_new_ie, gen_new_ie_gen_params), + KUNIT_CASE(test_gen_new_ie_malformed), + {} +}; + +static struct kunit_suite gen_new_ie = { + .name = "cfg80211-ie-generation", + .test_cases = gen_new_ie_test_cases, +}; + +kunit_test_suite(gen_new_ie); + +static struct kunit_case inform_bss_test_cases[] = { + KUNIT_CASE(test_inform_bss_ssid_only), + KUNIT_CASE_PARAM(test_inform_bss_ml_sta, inform_bss_ml_sta_gen_params), + {} +}; + +static struct kunit_suite inform_bss = { + .name = "cfg80211-inform-bss", + .test_cases = inform_bss_test_cases, +}; + +kunit_test_suite(inform_bss); diff --git a/net/wireless/tests/util.c b/net/wireless/tests/util.c new file mode 100644 index 0000000000..8abdaeb820 --- /dev/null +++ b/net/wireless/tests/util.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit fixture to have a (configurable) wiphy + * + * Copyright (C) 2023 Intel Corporation + */ +#include <linux/ieee80211.h> +#include <net/cfg80211.h> +#include <kunit/test.h> +#include <kunit/test-bug.h> +#include "util.h" + +int t_wiphy_init(struct kunit_resource *resource, void *ctx) +{ + struct kunit *test = kunit_get_current_test(); + struct cfg80211_ops *ops; + struct wiphy *wiphy; + struct t_wiphy_priv *priv; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ops); + + wiphy = wiphy_new_nm(ops, sizeof(*priv), "kunit"); + KUNIT_ASSERT_NOT_NULL(test, wiphy); + + priv = wiphy_priv(wiphy); + priv->ctx = ctx; + priv->ops = ops; + + /* Initialize channels, feel free to add more here channels/bands */ + memcpy(priv->channels_2ghz, channels_2ghz, sizeof(channels_2ghz)); + wiphy->bands[NL80211_BAND_2GHZ] = &priv->band_2ghz; + priv->band_2ghz.channels = priv->channels_2ghz; + priv->band_2ghz.n_channels = ARRAY_SIZE(channels_2ghz); + + resource->data = wiphy; + resource->name = "wiphy"; + + return 0; +} + +void t_wiphy_exit(struct kunit_resource *resource) +{ + struct t_wiphy_priv *priv; + struct cfg80211_ops *ops; + + priv = wiphy_priv(resource->data); + ops = priv->ops; + + /* Should we ensure anything about the state here? + * e.g. full destruction or no calls to any ops on destruction? + */ + + wiphy_free(resource->data); + kfree(ops); +} diff --git a/net/wireless/tests/util.h b/net/wireless/tests/util.h new file mode 100644 index 0000000000..6de712e0d4 --- /dev/null +++ b/net/wireless/tests/util.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Utilities for cfg80211 unit testing + * + * Copyright (C) 2023 Intel Corporation + */ +#ifndef __CFG80211_UTILS_H +#define __CFG80211_UTILS_H + +#define CHAN2G(_freq) { \ + .band = NL80211_BAND_2GHZ, \ + .center_freq = (_freq), \ + .hw_value = (_freq), \ +} + +static const struct ieee80211_channel channels_2ghz[] = { + CHAN2G(2412), /* Channel 1 */ + CHAN2G(2417), /* Channel 2 */ + CHAN2G(2422), /* Channel 3 */ + CHAN2G(2427), /* Channel 4 */ + CHAN2G(2432), /* Channel 5 */ + CHAN2G(2437), /* Channel 6 */ + CHAN2G(2442), /* Channel 7 */ + CHAN2G(2447), /* Channel 8 */ + CHAN2G(2452), /* Channel 9 */ + CHAN2G(2457), /* Channel 10 */ + CHAN2G(2462), /* Channel 11 */ + CHAN2G(2467), /* Channel 12 */ + CHAN2G(2472), /* Channel 13 */ + CHAN2G(2484), /* Channel 14 */ +}; + +struct t_wiphy_priv { + struct kunit *test; + struct cfg80211_ops *ops; + + void *ctx; + + struct ieee80211_supported_band band_2ghz; + struct ieee80211_channel channels_2ghz[ARRAY_SIZE(channels_2ghz)]; +}; + +#define T_WIPHY(test, ctx) ({ \ + struct wiphy *__wiphy = \ + kunit_alloc_resource(test, t_wiphy_init, \ + t_wiphy_exit, \ + GFP_KERNEL, &(ctx)); \ + \ + KUNIT_ASSERT_NOT_NULL(test, __wiphy); \ + __wiphy; \ + }) +#define t_wiphy_ctx(wiphy) (((struct t_wiphy_priv *)wiphy_priv(wiphy))->ctx) + +int t_wiphy_init(struct kunit_resource *resource, void *data); +void t_wiphy_exit(struct kunit_resource *resource); + +#define t_skb_remove_member(skb, type, member) do { \ + memmove((skb)->data + (skb)->len - sizeof(type) + \ + offsetof(type, member), \ + (skb)->data + (skb)->len - sizeof(type) + \ + offsetofend(type, member), \ + offsetofend(type, member)); \ + skb_trim(skb, (skb)->len - sizeof_field(type, member)); \ + } while (0) + +#endif /* __CFG80211_UTILS_H */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 30cd1bd58a..1f374c8a17 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2928,7 +2928,7 @@ DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth, TRACE_EVENT(cfg80211_send_rx_assoc, TP_PROTO(struct net_device *netdev, - struct cfg80211_rx_assoc_resp_data *data), + const struct cfg80211_rx_assoc_resp_data *data), TP_ARGS(netdev, data), TP_STRUCT__entry( NETDEV_ENTRY @@ -3979,6 +3979,26 @@ TRACE_EVENT(cfg80211_links_removed, __entry->link_mask) ); +TRACE_EVENT(rdev_set_ttlm, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_ttlm_params *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, dlink, sizeof(u16) * 8) + __array(u8, ulink, sizeof(u16) * 8) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memcpy(__entry->dlink, params->dlink, sizeof(params->dlink)); + memcpy(__entry->ulink, params->ulink, sizeof(params->ulink)); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 626b858b4b..b9d15f3693 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -791,15 +791,19 @@ ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { - int offset = 0, remaining, subframe_len, padding; + int offset = 0, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { + int remaining = skb->len - offset; struct { __be16 len; u8 mesh_flags; } hdr; u16 len; + if (sizeof(hdr) > remaining) + return false; + if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; @@ -807,7 +811,6 @@ bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; - remaining = skb->len - offset; if (subframe_len > remaining) return false; @@ -825,7 +828,7 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; - int offset = 0, remaining; + int offset = 0; struct { struct ethhdr eth; uint8_t flags; @@ -839,10 +842,14 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, copy_len = sizeof(hdr); while (!last) { + int remaining = skb->len - offset; unsigned int subframe_len; int len, mesh_len = 0; u8 padding; + if (copy_len > remaining) + goto purge; + skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); @@ -852,7 +859,6 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ - remaining = skb->len - offset; if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ @@ -980,7 +986,63 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb, } } + /* The default mapping as defined Section 2.3 in RFC8325: The three + * Most Significant Bits (MSBs) of the DSCP are used as the + * corresponding L2 markings. + */ ret = dscp >> 5; + + /* Handle specific DSCP values for which the default mapping (as + * described above) doesn't adhere to the intended usage of the DSCP + * value. See section 4 in RFC8325. Specifically, for the following + * Diffserv Service Classes no update is needed: + * - Standard: DF + * - Low Priority Data: CS1 + * - Multimedia Streaming: AF31, AF32, AF33 + * - Multimedia Conferencing: AF41, AF42, AF43 + * - Network Control Traffic: CS7 + * - Real-Time Interactive: CS4 + */ + switch (dscp >> 2) { + case 10: + case 12: + case 14: + /* High throughput data: AF11, AF12, AF13 */ + ret = 0; + break; + case 16: + /* Operations, Administration, and Maintenance and Provisioning: + * CS2 + */ + ret = 0; + break; + case 18: + case 20: + case 22: + /* Low latency data: AF21, AF22, AF23 */ + ret = 3; + break; + case 24: + /* Broadcasting video: CS3 */ + ret = 4; + break; + case 40: + /* Signaling: CS5 */ + ret = 5; + break; + case 44: + /* Voice Admit: VA */ + ret = 6; + break; + case 46: + /* Telephony traffic: EF */ + ret = 6; + break; + case 48: + /* Network Control Traffic: CS6 */ + ret = 7; + break; + } out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ae90696efe..d18d51412c 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -704,7 +704,7 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) rc = -EINVAL; } release_sock(sk); - SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); + net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } @@ -1165,10 +1165,10 @@ static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; } - SOCK_DEBUG(sk, "x25_sendmsg: sendto: Addresses built.\n"); + net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ - SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n"); + net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; @@ -1187,7 +1187,7 @@ static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* * Put the data on the end */ - SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n"); + net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); @@ -1211,7 +1211,7 @@ static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) /* * Push down the X.25 header */ - SOCK_DEBUG(sk, "x25_sendmsg: Building X.25 Header.\n"); + net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { @@ -1245,8 +1245,8 @@ static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) skb->data[0] |= X25_Q_BIT; } - SOCK_DEBUG(sk, "x25_sendmsg: Built header.\n"); - SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n"); + net_dbg_ratelimited("x25_sendmsg: Built header.\n"); + net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 8e1a49b0c0..6dadb217e1 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -282,7 +282,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, * They want reverse charging, we won't accept it. */ if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) { - SOCK_DEBUG(sk, "X.25: rejecting reverse charging request\n"); + net_dbg_ratelimited("X.25: rejecting reverse charging request\n"); return -1; } @@ -294,11 +294,11 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, int ours_in = ours->throughput & 0x0f; int ours_out = ours->throughput & 0xf0; if (!ours_in || theirs_in < ours_in) { - SOCK_DEBUG(sk, "X.25: inbound throughput negotiated\n"); + net_dbg_ratelimited("X.25: inbound throughput negotiated\n"); new->throughput = (new->throughput & 0xf0) | theirs_in; } if (!ours_out || theirs_out < ours_out) { - SOCK_DEBUG(sk, + net_dbg_ratelimited( "X.25: outbound throughput negotiated\n"); new->throughput = (new->throughput & 0x0f) | theirs_out; } @@ -306,22 +306,22 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, if (theirs.pacsize_in && theirs.pacsize_out) { if (theirs.pacsize_in < ours->pacsize_in) { - SOCK_DEBUG(sk, "X.25: packet size inwards negotiated down\n"); + net_dbg_ratelimited("X.25: packet size inwards negotiated down\n"); new->pacsize_in = theirs.pacsize_in; } if (theirs.pacsize_out < ours->pacsize_out) { - SOCK_DEBUG(sk, "X.25: packet size outwards negotiated down\n"); + net_dbg_ratelimited("X.25: packet size outwards negotiated down\n"); new->pacsize_out = theirs.pacsize_out; } } if (theirs.winsize_in && theirs.winsize_out) { if (theirs.winsize_in < ours->winsize_in) { - SOCK_DEBUG(sk, "X.25: window size inwards negotiated down\n"); + net_dbg_ratelimited("X.25: window size inwards negotiated down\n"); new->winsize_in = theirs.winsize_in; } if (theirs.winsize_out < ours->winsize_out) { - SOCK_DEBUG(sk, "X.25: window size outwards negotiated down\n"); + net_dbg_ratelimited("X.25: window size outwards negotiated down\n"); new->winsize_out = theirs.winsize_out; } } diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c index dbc0940bf3..f8922b0e23 100644 --- a/net/x25/x25_out.c +++ b/net/x25/x25_out.c @@ -72,7 +72,7 @@ int x25_output(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return sent; } - SOCK_DEBUG(sk, "x25_output: fragment alloc" + net_dbg_ratelimited("x25_output: fragment alloc" " failed, err=%d, %d bytes " "sent\n", err, sent); return err; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 06cead2b8e..caa340134b 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -148,6 +148,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) return 0; } +#define XDP_UMEM_FLAGS_VALID ( \ + XDP_UMEM_UNALIGNED_CHUNK_FLAG | \ + XDP_UMEM_TX_SW_CSUM | \ + 0) + static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; @@ -167,7 +172,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG) + if (mr->flags & ~XDP_UMEM_FLAGS_VALID) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) @@ -199,6 +204,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; + if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) + return -EINVAL; + umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; @@ -207,6 +215,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; + umem->tx_metadata_len = mr->tx_metadata_len; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index da1582de6e..7d1c0986f9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -575,6 +575,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb) static void xsk_destruct_skb(struct sk_buff *skb) { + struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; + + if (compl->tx_timestamp) { + /* sw completion timestamp, not a real one */ + *compl->tx_timestamp = ktime_get_tai_fast_ns(); + } + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } @@ -659,8 +666,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { + struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; + bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { @@ -691,6 +700,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, kfree_skb(skb); goto free_err; } + + first_frag = true; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; @@ -714,12 +725,45 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); } + + if (first_frag && desc->options & XDP_TX_METADATA) { + if (unlikely(xs->pool->tx_metadata_len == 0)) { + err = -EINVAL; + goto free_err; + } + + meta = buffer - xs->pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { + err = -EINVAL; + goto free_err; + } + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > len)) { + err = -EINVAL; + goto free_err; + } + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(xs->pool->tx_sw_csum)) { + err = skb_checksum_help(skb); + if (err) + goto free_err; + } + } + } } skb->dev = dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_set_destructor_arg(skb); return skb; @@ -1287,6 +1331,14 @@ struct xdp_umem_reg_v1 { __u32 headroom; }; +struct xdp_umem_reg_v2 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; + __u32 flags; +}; + static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -1330,8 +1382,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; - else if (optlen < sizeof(mr)) + else if (optlen < sizeof(struct xdp_umem_reg_v2)) mr_size = sizeof(struct xdp_umem_reg_v1); + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v2); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; @@ -1360,6 +1414,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, struct xsk_queue **q; int entries; + if (optlen < sizeof(entries)) + return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index b0a6116778..ce60ecd48a 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -85,6 +85,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, XDP_PACKET_HEADROOM; pool->umem = umem; pool->addrs = umem->addrs; + pool->tx_metadata_len = umem->tx_metadata_len; + pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); @@ -123,6 +125,18 @@ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) } EXPORT_SYMBOL(xp_set_rxq_info); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + memcpy(xskb->cb + desc->off, desc->src, desc->bytes); + } +} +EXPORT_SYMBOL(xp_fill_cb); + static void xp_disable_drv_zc(struct xsk_buff_pool *pool) { struct netdev_bpf bpf; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 13354a1e42..6f2d1621c9 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -137,21 +137,23 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_unused_options_set(u32 options) { - return options & ~XDP_PKT_CONTD; + return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 offset = desc->addr & (pool->chunk_size - 1); + u64 addr = desc->addr - pool->tx_metadata_len; + u64 len = desc->len + pool->tx_metadata_len; + u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; - if (offset + desc->len > pool->chunk_size) + if (offset + len > pool->chunk_size) return false; - if (desc->addr >= pool->addrs_cnt) + if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) @@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; + u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; - if (desc->len > pool->chunk_size) + if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index cd47f88921..547cec77ba 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o +obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 41533c6314..e6da7e8495 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -858,4 +858,5 @@ int xfrm_count_pfkey_enc_supported(void) } EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported); +MODULE_DESCRIPTION("XFRM Algorithm interface"); MODULE_LICENSE("GPL"); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9c5f2efed3..da6ecc6b3e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4220,6 +4220,8 @@ void __init xfrm_init(void) #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif + + register_xfrm_state_bpf(); } #ifdef CONFIG_AUDITSYSCALL diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c new file mode 100644 index 0000000000..9e20d4a377 --- /dev/null +++ b/net/xfrm/xfrm_state_bpf.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable XFRM state BPF helpers. + * + * Note that it is allowed to break compatibility for these functions since the + * interface they are exposed through to BPF programs is explicitly unstable. + */ + +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <net/xdp.h> +#include <net/xfrm.h> + +/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers + * + * Members: + * @error - Out parameter, set for any errors encountered + * Values: + * -EINVAL - netns_id is less than -1 + * -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ + * -ENONET - No network namespace found for netns_id + * -ENOENT - No xfrm_state found + * @netns_id - Specify the network namespace for lookup + * Values: + * BPF_F_CURRENT_NETNS (-1) + * Use namespace associated with ctx + * [0, S32_MAX] + * Network Namespace ID + * @mark - XFRM mark to match on + * @daddr - Destination address to match on + * @spi - Security parameter index to match on + * @proto - IP protocol to match on (eg. IPPROTO_ESP) + * @family - Protocol family to match on (AF_INET/AF_INET6) + */ +struct bpf_xfrm_state_opts { + s32 error; + s32 netns_id; + u32 mark; + xfrm_address_t daddr; + __be32 spi; + u8 proto; + u16 family; +}; + +enum { + BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts), +}; + +__bpf_kfunc_start_defs(); + +/* bpf_xdp_get_xfrm_state - Get XFRM state + * + * A `struct xfrm_state *`, if found, must be released with a corresponding + * bpf_xdp_xfrm_state_release. + * + * Parameters: + * @ctx - Pointer to ctx (xdp_md) in XDP program + * Cannot be NULL + * @opts - Options for lookup (documented above) + * Cannot be NULL + * @opts__sz - Length of the bpf_xfrm_state_opts structure + * Must be BPF_XFRM_STATE_OPTS_SZ + */ +__bpf_kfunc struct xfrm_state * +bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz) +{ + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + struct net *net = dev_net(xdp->rxq->dev); + struct xfrm_state *x; + + if (!opts || opts__sz < sizeof(opts->error)) + return NULL; + + if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) { + opts->error = -EINVAL; + return NULL; + } + + if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) { + opts->error = -EINVAL; + return NULL; + } + + if (opts->netns_id >= 0) { + net = get_net_ns_by_id(net, opts->netns_id); + if (unlikely(!net)) { + opts->error = -ENONET; + return NULL; + } + } + + x = xfrm_state_lookup(net, opts->mark, &opts->daddr, opts->spi, + opts->proto, opts->family); + + if (opts->netns_id >= 0) + put_net(net); + if (!x) + opts->error = -ENOENT; + + return x; +} + +/* bpf_xdp_xfrm_state_release - Release acquired xfrm_state object + * + * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects + * the program if any references remain in the program in all of the explored + * states. + * + * Parameters: + * @x - Pointer to referenced xfrm_state object, obtained using + * bpf_xdp_get_xfrm_state. + */ +__bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x) +{ + xfrm_state_put(x); +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(xfrm_state_kfunc_set) +BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE) +BTF_SET8_END(xfrm_state_kfunc_set) + +static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = { + .owner = THIS_MODULE, + .set = &xfrm_state_kfunc_set, +}; + +int __init register_xfrm_state_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &xfrm_state_xdp_kfunc_set); +} diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 444e58bc3f..912c1189ba 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3891,5 +3891,6 @@ static void __exit xfrm_user_exit(void) module_init(xfrm_user_init); module_exit(xfrm_user_exit); +MODULE_DESCRIPTION("XFRM User interface"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); |