diff options
Diffstat (limited to 'include/net')
63 files changed, 1690 insertions, 428 deletions
diff --git a/include/net/9p/client.h b/include/net/9p/client.h index 78ebcf782c..4f785098c6 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -207,6 +207,8 @@ int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err int p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err); int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err); +struct netfs_io_subrequest; +void p9_client_write_subreq(struct netfs_io_subrequest *subreq); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset); int p9dirent_read(struct p9_client *clnt, char *buf, int len, struct p9_dirent *dirent); diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 3dee0b2721..b6eedf7650 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,14 +17,31 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif -extern spinlock_t unix_gc_lock; extern unsigned int unix_tot_inflight; - -void unix_inflight(struct user_struct *user, struct file *fp); -void unix_notinflight(struct user_struct *user, struct file *fp); +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); void wait_for_unix_gc(struct scm_fp_list *fpl); +struct unix_vertex { + struct list_head edges; + struct list_head entry; + struct list_head scc_entry; + unsigned long out_degree; + unsigned long index; + unsigned long scc_index; +}; + +struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; + struct list_head stack_entry; +}; + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) @@ -50,6 +67,7 @@ struct unix_skb_parms { struct scm_stat { atomic_t nr_fds; + unsigned long nr_unix_fds; }; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) @@ -62,12 +80,9 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; - struct list_head link; - unsigned long inflight; + struct sock *listener; + struct unix_vertex *vertex; spinlock_t lock; - unsigned long gc_flags; -#define UNIX_GC_CANDIDATE 0 -#define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; struct scm_stat scm_stat; diff --git a/include/net/ax25.h b/include/net/ax25.h index c2a85fd3f5..cb622d84cd 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -139,7 +139,9 @@ enum { AX25_VALUES_N2, /* Default N2 value */ AX25_VALUES_PACLEN, /* AX.25 MTU */ AX25_VALUES_PROTOCOL, /* Std AX.25, DAMA Slave, DAMA Master */ +#ifdef CONFIG_AX25_DAMA_SLAVE AX25_VALUES_DS_TIMEOUT, /* DAMA Slave timeout */ +#endif AX25_MAX_VALUES /* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */ }; diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index d46320f7fd..e372a88e8c 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -444,7 +444,6 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ -#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 @@ -2050,7 +2049,7 @@ struct hci_cp_le_set_cig_params { __le16 c_latency; __le16 p_latency; __u8 num_cis; - struct hci_cis_params cis[]; + struct hci_cis_params cis[] __counted_by(num_cis); } __packed; struct hci_rp_le_set_cig_params { @@ -2122,7 +2121,7 @@ struct hci_cp_le_big_create_sync { __u8 mse; __le16 timeout; __u8 num_bis; - __u8 bis[]; + __u8 bis[] __counted_by(num_bis); } __packed; #define HCI_OP_LE_BIG_TERM_SYNC 0x206c diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 17b2a7baa4..b15f51ae3b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -91,8 +91,6 @@ struct discovery_state { s8 rssi; u16 uuid_count; u8 (*uuids)[16]; - unsigned long scan_start; - unsigned long scan_duration; unsigned long name_resolve_timeout; }; @@ -890,8 +888,6 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev) hdev->discovery.uuid_count = 0; kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; - hdev->discovery.scan_start = 0; - hdev->discovery.scan_duration = 0; } bool hci_discovery_active(struct hci_dev *hdev); @@ -2220,8 +2216,22 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); /* These LE scan and inquiry parameters were chosen according to LE General * Discovery Procedure specification. */ -#define DISCOV_LE_SCAN_WIN 0x12 -#define DISCOV_LE_SCAN_INT 0x12 +#define DISCOV_LE_SCAN_WIN 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT_FAST 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_WIN_FAST 0x0030 /* 30 msec */ +#define DISCOV_LE_SCAN_INT_CONN 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_WIN_CONN 0x0060 /* 60 msec */ +#define DISCOV_LE_SCAN_INT_SLOW1 0x0800 /* 1.28 sec */ +#define DISCOV_LE_SCAN_WIN_SLOW1 0x0012 /* 11.25 msec */ +#define DISCOV_LE_SCAN_INT_SLOW2 0x1000 /* 2.56 sec */ +#define DISCOV_LE_SCAN_WIN_SLOW2 0x0024 /* 22.5 msec */ +#define DISCOV_CODED_SCAN_INT_FAST 0x0120 /* 180 msec */ +#define DISCOV_CODED_SCAN_WIN_FAST 0x0090 /* 90 msec */ +#define DISCOV_CODED_SCAN_INT_SLOW1 0x1800 /* 3.84 sec */ +#define DISCOV_CODED_SCAN_WIN_SLOW1 0x0036 /* 33.75 msec */ +#define DISCOV_CODED_SCAN_INT_SLOW2 0x3000 /* 7.68 sec */ +#define DISCOV_CODED_SCAN_WIN_SLOW2 0x006c /* 67.5 msec */ #define DISCOV_LE_TIMEOUT 10240 /* msec */ #define DISCOV_INTERLEAVED_TIMEOUT 5120 /* msec */ #define DISCOV_INTERLEAVED_INQUIRY_LEN 0x04 diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 3434cfc26b..5cfdc81349 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -463,18 +463,24 @@ struct l2cap_le_credits { #define L2CAP_ECRED_MAX_CID 5 struct l2cap_ecred_conn_req { - __le16 psm; - __le16 mtu; - __le16 mps; - __le16 credits; + /* New members must be added within the struct_group() macro below. */ + __struct_group(l2cap_ecred_conn_req_hdr, hdr, __packed, + __le16 psm; + __le16 mtu; + __le16 mps; + __le16 credits; + ); __le16 scid[]; } __packed; struct l2cap_ecred_conn_rsp { - __le16 mtu; - __le16 mps; - __le16 credits; - __le16 result; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr, + __le16 mtu; + __le16 mps; + __le16 credits; + __le16 result; + ); __le16 dcid[]; }; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1e09329acc..cbf1664dc5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1149,6 +1149,8 @@ ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef) * @band_mask: which bands to check on * @prohibited_flags: which channels to not consider usable, * %IEEE80211_CHAN_DISABLED is always taken into account + * + * Return: %true if usable channels found, %false otherwise */ bool cfg80211_any_usable_channels(struct wiphy *wiphy, unsigned long band_mask, @@ -1575,6 +1577,8 @@ struct cfg80211_csa_settings { * @beacon_next: beacon data to be used after the color change * @count: number of beacons until the color change * @color: the color used after the change + * @link_id: defines the link on which color change is expected during MLO. + * 0 in case of non-MLO. */ struct cfg80211_color_change_settings { struct cfg80211_beacon_data beacon_color_change; @@ -1583,6 +1587,7 @@ struct cfg80211_color_change_settings { struct cfg80211_beacon_data beacon_next; u8 count; u8 color; + u8 link_id; }; /** @@ -1833,9 +1838,11 @@ enum cfg80211_station_type { * * Utility function for the @change_station driver method. Call this function * with the appropriate station type looking up the station (and checking that - * it exists). It will verify whether the station change is acceptable, and if - * not will return an error code. Note that it may modify the parameters for - * backward compatibility reasons, so don't use them before calling this. + * it exists). It will verify whether the station change is acceptable. + * + * Return: 0 if the change is acceptable, otherwise an error code. Note that + * it may modify the parameters for backward compatibility reasons, so don't + * use them before calling this. */ int cfg80211_check_station_change(struct wiphy *wiphy, struct station_parameters *params, @@ -2229,7 +2236,7 @@ struct cfg80211_sar_capa { * @mac_addr: the mac address of the station of interest * @sinfo: pointer to the structure to fill with the information * - * Returns 0 on success and sinfo is filled with the available information + * Return: 0 on success and sinfo is filled with the available information * otherwise returns a negative error code and the content of sinfo has to be * considered undefined. */ @@ -5334,6 +5341,8 @@ struct wiphy_iftype_ext_capab { * cfg80211_get_iftype_ext_capa - lookup interface type extended capability * @wiphy: the wiphy to look up from * @type: the interface type to look up + * + * Return: The extended capability for the given interface @type, may be %NULL */ const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); @@ -5904,6 +5913,10 @@ int wiphy_register(struct wiphy *wiphy); /** * get_wiphy_regdom - get custom regdomain for the given wiphy * @wiphy: the wiphy to get the regdomain from + * + * Context: Requires any of RTNL, wiphy mutex or RCU protection. + * + * Return: pointer to the regulatory domain associated with the wiphy */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy); @@ -6421,6 +6434,8 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq) * * The Preferred Scanning Channels (PSC) are defined in * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3 + * + * Return: %true if channel is a PSC, %false otherwise */ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) { @@ -6450,8 +6465,8 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband, * ieee80211_mandatory_rates - get mandatory rates for a given band * @sband: the band to look for rates in * - * This function returns a bitmap of the mandatory rates for the given - * band, bits are set according to the rate position in the bitrates array. + * Return: a bitmap of the mandatory rates for the given band, bits + * are set according to the rate position in the bitrates array. */ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband); @@ -6665,6 +6680,8 @@ bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto); * header to the ethernet header (if present). * * @skb: The 802.3 frame with embedded mesh header + * + * Return: 0 on success. Non-zero on error. */ int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb); @@ -7043,6 +7060,8 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, * * You can use this to map the regulatory request initiator enum to a * proper string representation. + * + * Return: pointer to string representation of the initiator */ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); @@ -7051,6 +7070,8 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * @wiphy: wiphy for which pre-CAC capability is checked. * * Pre-CAC is allowed only in some regdomains (notable ETSI). + * + * Return: %true if allowed, %false otherwise */ bool regulatory_pre_cac_allowed(struct wiphy *wiphy); @@ -7185,6 +7206,8 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, * cfg80211_is_element_inherited - returns if element ID should be inherited * @element: element to check * @non_inherit_element: non inheritance element + * + * Return: %true if should be inherited, %false otherwise */ bool cfg80211_is_element_inherited(const struct element *element, const struct element *non_inherit_element); @@ -7197,6 +7220,8 @@ bool cfg80211_is_element_inherited(const struct element *element, * @sub_elem: current MBSSID subelement (profile) * @merged_ie: location of the merged profile * @max_copy_len: max merged profile length + * + * Return: the number of bytes merged */ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, @@ -7224,7 +7249,7 @@ enum cfg80211_bss_frame_type { * @ielen: length of IEs * @band: enum nl80211_band of the channel * - * Returns the channel number, or -1 if none could be determined. + * Return: the channel number, or -1 if none could be determined. */ int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, enum nl80211_band band); @@ -7302,6 +7327,8 @@ cfg80211_inform_bss(struct wiphy *wiphy, * @bss_type: type of BSS, see &enum ieee80211_bss_type * @privacy: privacy filter, see &enum ieee80211_privacy * @use_for: indicates which use is intended + * + * Return: Reference-counted BSS on success. %NULL on error. */ struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -7322,6 +7349,8 @@ struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, * @privacy: privacy filter, see &enum ieee80211_privacy * * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL. + * + * Return: Reference-counted BSS on success. %NULL on error. */ static inline struct cfg80211_bss * cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -7706,8 +7735,9 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb); * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID * @wiphy: the wiphy * - * Return the current netlink port ID in a vendor command handler. - * Valid to call only there. + * Return: the current netlink port ID in a vendor command handler. + * + * Context: May only be called from a vendor command handler */ unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy); @@ -8260,6 +8290,8 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, * * @sinfo: the station information * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); @@ -8777,13 +8809,13 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, * also checks if IR-relaxation conditions apply, to allow beaconing under * more permissive conditions. * - * Requires the wiphy mutex to be held. + * Context: Requires the wiphy mutex to be held. */ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, enum nl80211_iftype iftype); -/* +/** * cfg80211_ch_switch_notify - update wdev channel and notify userspace * @dev: the device which switched channels * @chandef: the new channel definition @@ -8796,7 +8828,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, unsigned int link_id); -/* +/** * cfg80211_ch_switch_started_notify - notify channel switch start * @dev: the device on which the channel switch started * @chandef: the future channel definition @@ -8819,7 +8851,7 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, * @operating_class: the operating class to convert * @band: band pointer to fill * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band); @@ -8831,7 +8863,7 @@ bool ieee80211_operating_class_to_band(u8 operating_class, * @chan: the ieee80211_channel to convert * @chandef: a pointer to the resulting chandef * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_operating_class_to_chandef(u8 operating_class, struct ieee80211_channel *chan, @@ -8843,7 +8875,7 @@ bool ieee80211_operating_class_to_chandef(u8 operating_class, * @chandef: the chandef to convert * @op_class: a pointer to the resulting operating class * - * Returns %true if the conversion was successful, %false otherwise. + * Return: %true if the conversion was successful, %false otherwise. */ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class); @@ -8853,7 +8885,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, * * @chandef: the chandef to convert * - * Returns the center frequency of chandef (1st segment) in KHz. + * Return: the center frequency of chandef (1st segment) in KHz. */ static inline u32 ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) @@ -8861,7 +8893,7 @@ ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef) return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; } -/* +/** * cfg80211_tdls_oper_request - request userspace to perform TDLS operation * @dev: the device on which the operation is requested * @peer: the MAC address of the peer device @@ -8880,11 +8912,11 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper, u16 reason_code, gfp_t gfp); -/* +/** * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units) * @rate: given rate_info to calculate bitrate from * - * return 0 if MCS index >= 32 + * Return: calculated bitrate */ u32 cfg80211_calculate_bitrate(struct rate_info *rate); @@ -8898,7 +8930,7 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate); * when the driver wishes to unregister the wdev, e.g. when the hardware device * is unbound from the driver. * - * Requires the RTNL and wiphy mutex to be held. + * Context: Requires the RTNL and wiphy mutex to be held. */ void cfg80211_unregister_wdev(struct wireless_dev *wdev); @@ -8911,7 +8943,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev); * held. Otherwise, both register_netdevice() and register_netdev() are usable * instead as well. * - * Requires the RTNL and wiphy mutex to be held. + * Context: Requires the RTNL and wiphy mutex to be held. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_register_netdevice(struct net_device *dev); @@ -8924,7 +8958,7 @@ int cfg80211_register_netdevice(struct net_device *dev); * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are * usable instead as well. * - * Requires the RTNL and wiphy mutex to be held. + * Context: Requires the RTNL and wiphy mutex to be held. */ static inline void cfg80211_unregister_netdevice(struct net_device *dev) { @@ -9000,9 +9034,9 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, * correctly, if not the result of using this function will not * be ordered correctly either, i.e. it does no reordering. * - * The function returns the offset where the next part of the - * buffer starts, which may be @ielen if the entire (remainder) - * of the buffer should be used. + * Return: The offset where the next part of the buffer starts, which + * may be @ielen if the entire (remainder) of the buffer should be + * used. */ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, @@ -9030,9 +9064,9 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * correctly, if not the result of using this function will not * be ordered correctly either, i.e. it does no reordering. * - * The function returns the offset where the next part of the - * buffer starts, which may be @ielen if the entire (remainder) - * of the buffer should be used. + * Return: The offset where the next part of the buffer starts, which + * may be @ielen if the entire (remainder) of the buffer should be + * used. */ static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, size_t offset) @@ -9096,6 +9130,8 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy); * This function can be called by the driver to check whether a * combination of interfaces and their types are allowed according to * the interface combinations. + * + * Return: 0 if combinations are allowed. Non-zero on error. */ int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params); @@ -9111,6 +9147,8 @@ int cfg80211_check_combinations(struct wiphy *wiphy, * This function can be called by the driver to check what possible * combinations it fits in at a given moment, e.g. for channel switching * purposes. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, @@ -9118,7 +9156,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, void *data), void *data); -/* +/** * cfg80211_stop_iface - trigger interface disconnection * * @wiphy: the wiphy @@ -9173,6 +9211,8 @@ static inline void wiphy_ext_feature_set(struct wiphy *wiphy, * * The extended features are flagged in multiple bytes (see * &struct wiphy.@ext_features) + * + * Return: %true if extended feature flag is set, %false otherwise */ static inline bool wiphy_ext_feature_isset(struct wiphy *wiphy, @@ -9294,6 +9334,8 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, * Check whether the interface is allowed to operate; additionally, this API * can be used to check iftype against the software interfaces when * check_swif is '1'. + * + * Return: %true if allowed, %false otherwise */ bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif); @@ -9386,60 +9428,78 @@ void cfg80211_bss_flush(struct wiphy *wiphy); * @cmd: the actual event we want to notify * @count: the number of TBTTs until the color change happens * @color_bitmap: representations of the colors that the local BSS is aware of + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Return: 0 on success. Non-zero on error. */ int cfg80211_bss_color_notify(struct net_device *dev, enum nl80211_commands cmd, u8 count, - u64 color_bitmap); + u64 color_bitmap, u8 link_id); /** * cfg80211_obss_color_collision_notify - notify about bss color collision * @dev: network device * @color_bitmap: representations of the colors that the local BSS is aware of + * @link_id: valid link_id in case of MLO or 0 for non-MLO. + * + * Return: 0 on success. Non-zero on error. */ static inline int cfg80211_obss_color_collision_notify(struct net_device *dev, - u64 color_bitmap) + u64 color_bitmap, + u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_OBSS_COLOR_COLLISION, - 0, color_bitmap); + 0, color_bitmap, link_id); } /** * cfg80211_color_change_started_notify - notify color change start * @dev: the device on which the color is switched * @count: the number of TBTTs until the color change happens + * @link_id: valid link_id in case of MLO or 0 for non-MLO. * * Inform the userspace about the color change that has started. + * + * Return: 0 on success. Non-zero on error. */ static inline int cfg80211_color_change_started_notify(struct net_device *dev, - u8 count) + u8 count, u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_STARTED, - count, 0); + count, 0, link_id); } /** * cfg80211_color_change_aborted_notify - notify color change abort * @dev: the device on which the color is switched + * @link_id: valid link_id in case of MLO or 0 for non-MLO. * * Inform the userspace about the color change that has aborted. + * + * Return: 0 on success. Non-zero on error. */ -static inline int cfg80211_color_change_aborted_notify(struct net_device *dev) +static inline int cfg80211_color_change_aborted_notify(struct net_device *dev, + u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_ABORTED, - 0, 0); + 0, 0, link_id); } /** * cfg80211_color_change_notify - notify color change completion * @dev: the device on which the color was switched + * @link_id: valid link_id in case of MLO or 0 for non-MLO. * * Inform the userspace about the color change that has completed. + * + * Return: 0 on success. Non-zero on error. */ -static inline int cfg80211_color_change_notify(struct net_device *dev) +static inline int cfg80211_color_change_notify(struct net_device *dev, + u8 link_id) { return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_COMPLETED, - 0, 0); + 0, 0, link_id); } /** @@ -9477,6 +9537,8 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); * @ppos: read position * @handler: the read handler to call (under wiphy lock) * @data: additional data to pass to the read handler + * + * Return: the number of characters read, or a negative errno */ ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, @@ -9499,6 +9561,8 @@ ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file, * @count: read count * @handler: the write handler to call (under wiphy lock) * @data: additional data to pass to the write handler + * + * Return: the number of characters written, or a negative errno */ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h index 53dd7d988a..c9111bb2f5 100644 --- a/include/net/cipso_ipv4.h +++ b/include/net/cipso_ipv4.h @@ -183,7 +183,8 @@ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr); int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr); + const struct netlbl_lsm_secattr *secattr, + bool sk_locked); void cipso_v4_sock_delattr(struct sock *sk); int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int cipso_v4_req_setattr(struct request_sock *req, @@ -214,7 +215,8 @@ static inline int cipso_v4_getattr(const unsigned char *cipso, static inline int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { return -ENOSYS; } diff --git a/include/net/devlink.h b/include/net/devlink.h index 9ac394bdfb..35eb0f8843 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -483,7 +483,8 @@ struct devlink_param { int (*get)(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx); int (*set)(struct devlink *devlink, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int (*validate)(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack); @@ -599,12 +600,14 @@ enum devlink_param_generic_id { .validate = _validate, \ } -/* Part number, identifier of board design */ +/* Identifier of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" /* Revision of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_REV "board.rev" /* Maker of the board */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_MANUFACTURE "board.manufacture" +/* Part number of the board and its components */ +#define DEVLINK_INFO_VERSION_GENERIC_BOARD_PART_NUMBER "board.part_number" /* Part number, identifier of asic design */ #define DEVLINK_INFO_VERSION_GENERIC_ASIC_ID "asic.id" @@ -1602,6 +1605,14 @@ void devlink_free(struct devlink *devlink); * capability. Should be used by device drivers to * enable/disable ipsec_packet capability of a * function managed by the devlink port. + * @port_fn_max_io_eqs_get: Callback used to get port function's maximum number + * of event queues. Should be used by device drivers to + * report the maximum event queues of a function + * managed by the devlink port. + * @port_fn_max_io_eqs_set: Callback used to set port function's maximum number + * of event queues. Should be used by device drivers to + * configure maximum number of event queues + * of a function managed by the devlink port. * * Note: Driver should return -EOPNOTSUPP if it doesn't support * port function (@port_fn_*) handling for a particular port. @@ -1651,6 +1662,12 @@ struct devlink_port_ops { int (*port_fn_ipsec_packet_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + int (*port_fn_max_io_eqs_get)(struct devlink_port *devlink_port, + u32 *max_eqs, + struct netlink_ext_ack *extack); + int (*port_fn_max_io_eqs_set)(struct devlink_port *devlink_port, + u32 max_eqs, + struct netlink_ext_ack *extack); }; void devlink_port_init(struct devlink *devlink, diff --git a/include/net/dsa.h b/include/net/dsa.h index 7c0da9effe..b60e7e410a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -24,9 +24,6 @@ struct dsa_8021q_context; struct tc_action; -struct phy_device; -struct fixed_phy_status; -struct phylink_link_state; #define DSA_TAG_PROTO_NONE_VALUE 0 #define DSA_TAG_PROTO_BRCM_VALUE 1 @@ -327,6 +324,12 @@ struct dsa_port { }; }; +static inline struct dsa_port * +dsa_phylink_to_port(struct phylink_config *config) +{ + return container_of(config, struct dsa_port, pl_config); +} + /* TODO: ideally DSA ports would have a single dp->link_dp member, * and no dst->rtable nor this struct dsa_link would be needed, * but this would require some more complex tree walking, @@ -430,6 +433,11 @@ struct dsa_switch { */ u32 fdb_isolation:1; + /* Drivers that have global DSCP mapping settings must set this to + * true to automatically apply the settings to all ports. + */ + u32 dscp_prio_mapping_is_global:1; + /* Listener for switch fabric events */ struct notifier_block nb; @@ -452,6 +460,11 @@ struct dsa_switch { const struct dsa_switch_ops *ops; /* + * Allow a DSA switch driver to override the phylink MAC ops + */ + const struct phylink_mac_ops *phylink_mac_ops; + + /* * User mii_bus and devices for the individual ports. */ u32 phys_mii_mask; @@ -578,6 +591,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_user((_dp))) +#define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \ + dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \ + if (dsa_port_is_user((_dp))) + #define dsa_switch_for_each_cpu_port(_dp, _ds) \ dsa_switch_for_each_port((_dp), (_ds)) \ if (dsa_port_is_cpu((_dp))) @@ -858,14 +875,6 @@ struct dsa_switch_ops { int regnum, u16 val); /* - * Link state adjustment (called from libphy) - */ - void (*adjust_link)(struct dsa_switch *ds, int port, - struct phy_device *phydev); - void (*fixed_link_update)(struct dsa_switch *ds, int port, - struct fixed_phy_status *st); - - /* * PHYLINK integration */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, @@ -955,6 +964,10 @@ struct dsa_switch_ops { u8 prio); int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, u8 prio); + int (*port_set_apptrust)(struct dsa_switch *ds, int port, + const u8 *sel, int nsel); + int (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel, + int *nsel); /* * Suspend and resume @@ -1247,7 +1260,8 @@ struct dsa_switch_ops { int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx); int dsa_devlink_param_set(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int dsa_devlink_params_register(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); diff --git a/include/net/dscp.h b/include/net/dscp.h new file mode 100644 index 0000000000..ba40540868 --- /dev/null +++ b/include/net/dscp.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */ + +#ifndef __DSCP_H__ +#define __DSCP_H__ + +/* + * DSCP Pools and Codepoint Space Division: + * + * The Differentiated Services (Diffserv) architecture defines a method for + * classifying and managing network traffic using the DS field in IPv4 and IPv6 + * packet headers. This field can carry one of 64 distinct DSCP (Differentiated + * Services Code Point) values, which are divided into three pools based on + * their Least Significant Bits (LSB) patterns and intended usage. Each pool has + * a specific registration procedure for assigning DSCP values: + * + * Pool 1 (Standards Action Pool): + * - Codepoint Space: xxxxx0 + * This pool includes DSCP values ending in '0' (binary), allocated via + * Standards Action. It is intended for globally recognized traffic classes, + * ensuring interoperability across the internet. This pool encompasses + * well-known DSCP values such as CS0-CS7, AFxx, EF, and VOICE-ADMIT. + * + * Pool 2 (Experimental/Local Use Pool): + * - Codepoint Space: xxxx11 + * Reserved for DSCP values ending in '11' (binary), this pool is designated + * for Experimental or Local Use. It allows for private or temporary traffic + * marking schemes not intended for standardized global use, facilitating + * testing and network-specific configurations without impacting + * interoperability. + * + * Pool 3 (Preferential Standardization Pool): + * - Codepoint Space: xxxx01 + * Initially reserved for experimental or local use, this pool now serves as + * a secondary standardization resource should Pool 1 become exhausted. DSCP + * values ending in '01' (binary) are assigned via Standards Action, with a + * focus on adopting new, standardized traffic classes as the need arises. + * + * For pool updates see: + * https://www.iana.org/assignments/dscp-registry/dscp-registry.xhtml + */ + +/* Pool 1: Standardized DSCP values as per [RFC8126] */ +#define DSCP_CS0 0 /* 000000, [RFC2474] */ +/* CS0 is some times called default (DF) */ +#define DSCP_DF 0 /* 000000, [RFC2474] */ +#define DSCP_CS1 8 /* 001000, [RFC2474] */ +#define DSCP_CS2 16 /* 010000, [RFC2474] */ +#define DSCP_CS3 24 /* 011000, [RFC2474] */ +#define DSCP_CS4 32 /* 100000, [RFC2474] */ +#define DSCP_CS5 40 /* 101000, [RFC2474] */ +#define DSCP_CS6 48 /* 110000, [RFC2474] */ +#define DSCP_CS7 56 /* 111000, [RFC2474] */ +#define DSCP_AF11 10 /* 001010, [RFC2597] */ +#define DSCP_AF12 12 /* 001100, [RFC2597] */ +#define DSCP_AF13 14 /* 001110, [RFC2597] */ +#define DSCP_AF21 18 /* 010010, [RFC2597] */ +#define DSCP_AF22 20 /* 010100, [RFC2597] */ +#define DSCP_AF23 22 /* 010110, [RFC2597] */ +#define DSCP_AF31 26 /* 011010, [RFC2597] */ +#define DSCP_AF32 28 /* 011100, [RFC2597] */ +#define DSCP_AF33 30 /* 011110, [RFC2597] */ +#define DSCP_AF41 34 /* 100010, [RFC2597] */ +#define DSCP_AF42 36 /* 100100, [RFC2597] */ +#define DSCP_AF43 38 /* 100110, [RFC2597] */ +#define DSCP_EF 46 /* 101110, [RFC3246] */ +#define DSCP_VOICE_ADMIT 44 /* 101100, [RFC5865] */ + +/* Pool 3: Standardized assignments, previously available for experimental/local + * use + */ +#define DSCP_LE 1 /* 000001, [RFC8622] */ + +#define DSCP_MAX 64 + +#endif /* __DSCP_H__ */ diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index df6622a5fe..b4a55d2d5e 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -76,7 +76,7 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, */ static inline void dst_cache_reset(struct dst_cache *dst_cache) { - dst_cache->reset_ts = jiffies; + WRITE_ONCE(dst_cache->reset_ts, jiffies); } /** diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 1b7fae4c6b..4160731dcb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -198,7 +198,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, __be16 tp_dst, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -215,7 +215,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, } static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -230,7 +230,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad __u8 tos, __u8 ttl, __be16 tp_dst, __be32 label, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { @@ -243,7 +243,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad info = &tun_dst->u.tun_info; info->mode = IP_TUNNEL_INFO_IPV6; - info->key.tun_flags = flags; + ip_tunnel_flags_copy(info->key.tun_flags, flags); info->key.tun_id = tunnel_id; info->key.tp_src = 0; info->key.tp_dst = tp_dst; @@ -259,7 +259,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad } static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, - __be16 flags, + const unsigned long *flags, __be64 tunnel_id, int md_size) { diff --git a/include/net/espintcp.h b/include/net/espintcp.h index 0335bbd765..c70efd704b 100644 --- a/include/net/espintcp.h +++ b/include/net/espintcp.h @@ -32,7 +32,7 @@ struct espintcp_ctx { static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* RCU is only needed for diag */ return (__force void *)icsk->icsk_ulp_data; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1a7131d6cb..9ab376d1a6 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -97,7 +97,7 @@ struct flow_dissector_key_enc_opts { * here but seems difficult to #include */ u8 len; - __be16 dst_opt_type; + u32 dst_opt_type; }; struct flow_dissector_key_keyid { diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 314087a5e1..ec9f80509f 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -345,7 +345,7 @@ static inline bool flow_action_has_entries(const struct flow_action *action) * flow_offload_has_one_action() - check if exactly one action is present * @action: tc filter flow offload action * - * Returns true if exactly one action is present. + * Return: true if exactly one action is present. */ static inline bool flow_offload_has_one_action(const struct flow_action *action) { @@ -449,6 +449,61 @@ static inline bool flow_rule_match_key(const struct flow_rule *rule, return dissector_uses_key(rule->match.dissector, key); } +/** + * flow_rule_is_supp_control_flags() - check for supported control flags + * @supp_flags: control flags supported by driver + * @ctrl_flags: control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if only supported control flags are set, false otherwise. + */ +static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, + const u32 ctrl_flags, + struct netlink_ext_ack *extack) +{ + if (likely((ctrl_flags & ~supp_flags) == 0)) + return true; + + NL_SET_ERR_MSG_FMT_MOD(extack, + "Unsupported match on control.flags %#x", + ctrl_flags); + + return false; +} + +/** + * flow_rule_has_control_flags() - check for presence of any control flags + * @ctrl_flags: control flags present in rule + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, + struct netlink_ext_ack *extack) +{ + return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); +} + +/** + * flow_rule_match_has_control_flags() - match and check for any control flags + * @rule: The flow_rule under evaluation. + * @extack: The netlink extended ACK for reporting errors. + * + * Return: true if control flags are set, false otherwise. + */ +static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, + struct netlink_ext_ack *extack) +{ + struct flow_match_control match; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) + return false; + + flow_rule_match_control(rule, &match); + + return flow_rule_has_control_flags(match.mask->flags, extack); +} + struct flow_stats { u64 pkts; u64 bytes; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ece6e5a3e..9ab49bfeae 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -2,12 +2,20 @@ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H -#include <linux/genetlink.h> +#include <linux/net.h> #include <net/netlink.h> #include <net/net_namespace.h> +#include <uapi/linux/genetlink.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) +/* Non-parallel generic netlink requests are serialized by a global lock. */ +void genl_lock(void); +void genl_unlock(void); + +#define MODULE_ALIAS_GENL_FAMILY(family) \ + MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) + /* Binding to multicast group requires %CAP_NET_ADMIN */ #define GENL_MCAST_CAP_NET_ADMIN BIT(0) /* Binding to multicast group requires %CAP_SYS_ADMIN */ diff --git a/include/net/gre.h b/include/net/gre.h index 4e209708b7..ccd2932032 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -49,67 +49,61 @@ static inline bool netif_is_ip6gretap(const struct net_device *dev) !strcmp(dev->rtnl_link_ops->kind, "ip6gretap"); } -static inline int gre_calc_hlen(__be16 o_flags) +static inline int gre_calc_hlen(const unsigned long *o_flags) { int addend = 4; - if (o_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags)) addend += 4; - if (o_flags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, o_flags)) addend += 4; - if (o_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags)) addend += 4; return addend; } -static inline __be16 gre_flags_to_tnl_flags(__be16 flags) +static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags) { - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; + IP_TUNNEL_DECLARE_FLAGS(res) = { }; + + __assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM); + __assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING); + __assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY); + __assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ); + __assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT); + __assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC); + __assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION); + + ip_tunnel_flags_copy(dst, res); } -static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags) +static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags) { __be16 flags = 0; - if (tflags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, tflags)) flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) + if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags)) flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, tflags)) flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, tflags)) flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) + if (test_bit(IP_TUNNEL_STRICT_BIT, tflags)) flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) + if (test_bit(IP_TUNNEL_REC_BIT, tflags)) flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) + if (test_bit(IP_TUNNEL_VERSION_BIT, tflags)) flags |= GRE_VERSION; return flags; } static inline void gre_build_header(struct sk_buff *skb, int hdr_len, - __be16 flags, __be16 proto, + const unsigned long *flags, __be16 proto, __be32 key, __be32 seq) { + IP_TUNNEL_DECLARE_FLAGS(cond) = { }; struct gre_base_hdr *greh; skb_push(skb, hdr_len); @@ -120,18 +114,22 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len, greh->flags = gre_tnl_flags_to_gre_flags(flags); greh->protocol = proto; - if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __set_bit(IP_TUNNEL_KEY_BIT, cond); + __set_bit(IP_TUNNEL_CSUM_BIT, cond); + __set_bit(IP_TUNNEL_SEQ_BIT, cond); + + if (ip_tunnel_flags_intersect(flags, cond)) { __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - if (flags & TUNNEL_SEQ) { + if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) { *ptr = seq; ptr--; } - if (flags & TUNNEL_KEY) { + if (test_bit(IP_TUNNEL_KEY_BIT, flags)) { *ptr = key; ptr--; } - if (flags & TUNNEL_CSUM && + if (test_bit(IP_TUNNEL_CSUM_BIT, flags) && !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; diff --git a/include/net/gro.h b/include/net/gro.h index c1d4ca0463..b9b58c1f8d 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -36,15 +36,14 @@ struct napi_gro_cb { /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush; - /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - /* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto; + u16 pad; + /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 @@ -75,8 +74,8 @@ struct napi_gro_cb { /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; + /* Used to determine if ipid_offset can be ignored */ + u8 ip_fixedid:1; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -181,12 +180,17 @@ static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, return ptr; } +static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; +} + static inline void *skb_gro_network_header(const struct sk_buff *skb) { if (skb_gro_may_pull(skb, skb_gro_offset(skb))) - return skb_gro_header_fast(skb, skb_network_offset(skb)); + return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); - return skb_network_header(skb); + return skb->data + skb_gro_receive_network_offset(skb); } static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, @@ -437,7 +441,71 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, skb_gro_len(skb), proto, 0)); } +static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, + struct sk_buff *p, bool outer) +{ + const u32 id = ntohl(*(__be32 *)&iph->id); + const u32 id2 = ntohl(*(__be32 *)&iph2->id); + const u16 ipid_offset = (id >> 16) - (id2 >> 16); + const u16 count = NAPI_GRO_CB(p)->count; + const u32 df = id & IP_DF; + int flush; + + /* All fields must match except length and checksum. */ + flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); + + if (flush | (outer && df)) + return flush; + + /* When we receive our second frame we can make a decision on if we + * continue this flow as an atomic flow with a fixed ID or if we use + * an incrementing ID. + */ + if (count == 1 && df && !ipid_offset) + NAPI_GRO_CB(p)->ip_fixedid = true; + + return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); +} + +static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) +{ + /* <Version:4><Traffic_Class:8><Flow_Label:20> */ + __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; + + /* Flush if Traffic Class fields are different. */ + return !!((first_word & htonl(0x0FF00000)) | + (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); +} + +static inline int __gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p, const u16 diff, + bool outer) +{ + const void *nh = th - diff; + const void *nh2 = th2 - diff; + + if (((struct iphdr *)nh)->version == 6) + return ipv6_gro_flush(nh, nh2); + else + return inet_gro_flush(nh, nh2, p, outer); +} + +static inline int gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p) +{ + const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; + int off = skb_transport_offset(p); + int flush; + + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); + if (encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + + return flush; +} + int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ static inline void gro_normal_list(struct napi_struct *napi) diff --git a/include/net/gtp.h b/include/net/gtp.h index 2a503f035d..c0253c8702 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -78,4 +78,9 @@ static inline bool netif_is_gtp(const struct net_device *dev) #define GTP1_F_EXTHDR 0x04 #define GTP1_F_MASK 0x07 +struct gtp_ext_hdr { + __u8 len; + __u8 data[]; +}; + #endif diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 003667a1ef..30e9570beb 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -38,6 +38,9 @@ struct net_hotdata { int max_backlog; int dev_tx_weight; int dev_rx_weight; + int sysctl_max_skb_frags; + int sysctl_skb_defer_max; + int sysctl_mem_pcpu_rsv; }; #define inet_ehash_secret net_hotdata.tcp_protocol.secret diff --git a/include/net/ieee8021q.h b/include/net/ieee8021q.h new file mode 100644 index 0000000000..8bfe903dd3 --- /dev/null +++ b/include/net/ieee8021q.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */ + +#ifndef _NET_IEEE8021Q_H +#define _NET_IEEE8021Q_H + +#include <linux/errno.h> + +/** + * enum ieee8021q_traffic_type - 802.1Q traffic type priority values (802.1Q-2022) + * + * @IEEE8021Q_TT_BK: Background + * @IEEE8021Q_TT_BE: Best Effort (default). According to 802.1Q-2022, BE is 0 + * but has higher priority than BK which is 1. + * @IEEE8021Q_TT_EE: Excellent Effort + * @IEEE8021Q_TT_CA: Critical Applications + * @IEEE8021Q_TT_VI: Video, < 100 ms latency and jitter + * @IEEE8021Q_TT_VO: Voice, < 10 ms latency and jitter + * @IEEE8021Q_TT_IC: Internetwork Control + * @IEEE8021Q_TT_NC: Network Control + */ +enum ieee8021q_traffic_type { + IEEE8021Q_TT_BK = 0, + IEEE8021Q_TT_BE = 1, + IEEE8021Q_TT_EE = 2, + IEEE8021Q_TT_CA = 3, + IEEE8021Q_TT_VI = 4, + IEEE8021Q_TT_VO = 5, + IEEE8021Q_TT_IC = 6, + IEEE8021Q_TT_NC = 7, + + /* private: */ + IEEE8021Q_TT_MAX, +}; + +#define SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp) ((dscp >> 3) & 0x7) + +#if IS_ENABLED(CONFIG_NET_IEEE8021Q_HELPERS) + +int ietf_dscp_to_ieee8021q_tt(u8 dscp); +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues); + +#else + +static inline int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + return -EOPNOTSUPP; +} + +static inline int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, + unsigned int num_queues) +{ + return -EOPNOTSUPP; +} + +#endif +#endif /* _NET_IEEE8021Q_H */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index f50a644d87..c17a6585d0 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -29,8 +29,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); -int inet_accept(struct socket *sock, struct socket *newsock, int flags, - bool kern); +int inet_accept(struct socket *sock, struct socket *newsock, + struct proto_accept_arg *arg); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 146ece8563..c0deaafebf 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -147,10 +147,7 @@ struct inet_connection_sock { #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ -static inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} +#define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk) static inline void *inet_csk_ca(const struct sock *sk) { @@ -253,7 +250,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, return (unsigned long)min_t(u64, when, max_when); } -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern); +struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg); int inet_csk_get_port(struct sock *sk, unsigned short snum); @@ -284,7 +281,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; + return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index f28da08a37..2a536eea94 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -111,7 +111,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); +void inet_twsk_purge(struct inet_hashinfo *hashinfo); static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 73524fa0c0..6cb867ce48 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -340,7 +340,7 @@ static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); - call_rcu(&f6i->rcu, fib6_info_destroy_rcu); + call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu); } } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index a18ed24fed..6dbdf60b34 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -127,18 +127,26 @@ void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, - unsigned int prefs, + unsigned int prefs, int l3mdev_index, struct in6_addr *saddr) { + struct net_device *l3mdev; + struct net_device *dev; + bool same_vrf; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) { + rcu_read_lock(); + + l3mdev = dev_get_by_index_rcu(net, l3mdev_index); + if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev) + dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev; + if (f6i && f6i->fib6_prefsrc.plen && same_vrf) *saddr = f6i->fib6_prefsrc.addr; - } else { - struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + else + err = ipv6_dev_get_saddr(net, same_vrf ? dev : l3mdev, daddr, prefs, saddr); - err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); - } + rcu_read_unlock(); return err; } diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 74b369bddf..399592405c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -30,8 +30,8 @@ struct __ip6_tnl_parm { struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ - __be16 i_flags; - __be16 o_flags; + IP_TUNNEL_DECLARE_FLAGS(i_flags); + IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9b2f69ba5e..c29639b432 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -173,6 +173,7 @@ struct fib_result { unsigned char type; unsigned char scope; u32 tclassid; + dscp_t dscp; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cc666da626..1db2417b8f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -36,6 +36,24 @@ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) +#define __ipt_flag_op(op, ...) \ + op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) + +#define IP_TUNNEL_DECLARE_FLAGS(...) \ + __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) + +#define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) +#define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) +#define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) +#define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) + +#define ip_tunnel_flags_empty(...) \ + __ipt_flag_op(bitmap_empty, __VA_ARGS__) +#define ip_tunnel_flags_intersect(...) \ + __ipt_flag_op(bitmap_intersects, __VA_ARGS__) +#define ip_tunnel_flags_subset(...) \ + __ipt_flag_op(bitmap_subset, __VA_ARGS__) + struct ip_tunnel_key { __be64 tun_id; union { @@ -48,11 +66,11 @@ struct ip_tunnel_key { struct in6_addr dst; } ipv6; } u; - __be16 tun_flags; - u8 tos; /* TOS for IPv4, TC for IPv6 */ - u8 ttl; /* TTL for IPv4, HL for IPv6 */ + IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; + u8 tos; /* TOS for IPv4, TC for IPv6 */ + u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; @@ -110,6 +128,17 @@ struct ip_tunnel_prl_entry { struct metadata_dst; +/* Kernel-side variant of ip_tunnel_parm */ +struct ip_tunnel_parm_kern { + char name[IFNAMSIZ]; + IP_TUNNEL_DECLARE_FLAGS(i_flags); + IP_TUNNEL_DECLARE_FLAGS(o_flags); + __be32 i_key; + __be32 o_key; + int link; + struct iphdr iph; +}; + struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; @@ -136,7 +165,7 @@ struct ip_tunnel { struct dst_cache dst_cache; - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ @@ -157,7 +186,7 @@ struct ip_tunnel { }; struct tnl_ptk_info { - __be16 flags; + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; @@ -179,11 +208,80 @@ struct ip_tunnel_net { int type; }; +static inline void ip_tunnel_set_options_present(unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); + __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); + + ip_tunnel_flags_or(flags, flags, present); +} + +static inline void ip_tunnel_clear_options_present(unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); + __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); + + __ipt_flag_op(bitmap_andnot, flags, flags, present); +} + +static inline bool ip_tunnel_is_options_present(const unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(present) = { }; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); + __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); + __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); + + return ip_tunnel_flags_intersect(flags, present); +} + +static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) +{ + IP_TUNNEL_DECLARE_FLAGS(supp) = { }; + + bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); + __set_bit(IP_TUNNEL_VTI_BIT, supp); + + return ip_tunnel_flags_subset(flags, supp); +} + +static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) +{ + ip_tunnel_flags_zero(dst); + + bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); + __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); +} + +static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) +{ + __be16 ret; + + ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); + if (test_bit(IP_TUNNEL_VTI_BIT, flags)) + ret |= VTI_ISVTI; + + return ret; +} + static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, + const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; @@ -193,7 +291,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, key->tos = tos; key->ttl = ttl; key->label = label; - key->tun_flags = tun_flags; + ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. @@ -214,12 +312,8 @@ ip_tunnel_dst_cache_usable(const struct sk_buff *skb, { if (skb->mark) return false; - if (!info) - return true; - if (info->key.tun_flags & TUNNEL_NOCACHE) - return false; - return true; + return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info @@ -291,14 +385,18 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data); +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, + int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); @@ -307,16 +405,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); + struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark); + struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms); + struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); @@ -548,12 +646,13 @@ static inline void ip_tunnel_info_opts_get(void *to, static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, - __be16 flags) + const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); - info->key.tun_flags |= flags; + ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, + flags); } } @@ -597,7 +696,7 @@ static inline void ip_tunnel_info_opts_get(void *to, static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, - __be16 flags) + const unsigned long *flags) { info->options_len = 0; } diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index 5cd7871127..4d114e6d6d 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -82,7 +82,12 @@ struct iucv_array { } __attribute__ ((aligned (8))); extern const struct bus_type iucv_bus; -extern struct device *iucv_root; + +struct device_driver; + +struct device *iucv_alloc_device(const struct attribute_group **attrs, + struct device_driver *driver, void *priv, + const char *fmt, ...) __printf(4, 5); /* * struct iucv_path diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h new file mode 100644 index 0000000000..f29ea3e34c --- /dev/null +++ b/include/net/libeth/rx.h @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_RX_H +#define __LIBETH_RX_H + +#include <linux/if_vlan.h> + +#include <net/page_pool/helpers.h> +#include <net/xdp.h> + +/* Rx buffer management */ + +/* Space reserved in front of each frame */ +#define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +/* Maximum headroom for worst-case calculations */ +#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM +/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ +#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) + +/* Always use order-0 pages */ +#define LIBETH_RX_PAGE_ORDER 0 +/* Pick a sane buffer stride and align to a cacheline boundary */ +#define LIBETH_RX_BUF_STRIDE SKB_DATA_ALIGN(128) +/* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */ +#define LIBETH_RX_PAGE_LEN(hr) \ + ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER), \ + LIBETH_RX_BUF_STRIDE) + +/** + * struct libeth_fqe - structure representing an Rx buffer (fill queue element) + * @page: page holding the buffer + * @offset: offset from the page start (to the headroom) + * @truesize: total space occupied by the buffer (w/ headroom and tailroom) + * + * Depending on the MTU, API switches between one-page-per-frame and shared + * page model (to conserve memory on bigger-page platforms). In case of the + * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. + */ +struct libeth_fqe { + struct page *page; + u32 offset; + u32 truesize; +} __aligned_largest; + +/** + * struct libeth_fq - structure representing a buffer (fill) queue + * @fp: hotpath part of the structure + * @pp: &page_pool for buffer management + * @fqes: array of Rx buffers + * @truesize: size to allocate per buffer, w/overhead + * @count: number of descriptors/buffers the queue has + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_fq { + struct_group_tagged(libeth_fq_fp, fp, + struct page_pool *pp; + struct libeth_fqe *fqes; + + u32 truesize; + u32 count; + ); + + /* Cold fields */ + u32 buf_len; + int nid; +}; + +int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi); +void libeth_rx_fq_destroy(struct libeth_fq *fq); + +/** + * libeth_rx_alloc - allocate a new Rx buffer + * @fq: fill queue to allocate for + * @i: index of the buffer within the queue + * + * Return: DMA address to be passed to HW for Rx on successful allocation, + * ```DMA_MAPPING_ERROR``` otherwise. + */ +static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) +{ + struct libeth_fqe *buf = &fq->fqes[i]; + + buf->truesize = fq->truesize; + buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); + if (unlikely(!buf->page)) + return DMA_MAPPING_ERROR; + + return page_pool_get_dma_addr(buf->page) + buf->offset + + fq->pp->p.offset; +} + +void libeth_rx_recycle_slow(struct page *page); + +/** + * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA + * @fqe: buffer to process + * @len: frame length from the descriptor + * + * Process the buffer after it's written by HW. The regular path is to + * synchronize DMA for CPU, but in case of no data it will be immediately + * recycled back to its PP. + * + * Return: true when there's data to process, false otherwise. + */ +static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, + u32 len) +{ + struct page *page = fqe->page; + + /* Very rare, but possible case. The most common reason: + * the last fragment contained FCS only, which was then + * stripped by the HW. + */ + if (unlikely(!len)) { + libeth_rx_recycle_slow(page); + return false; + } + + page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); + + return true; +} + +/* Converting abstract packet type numbers into a software structure with + * the packet parameters to do O(1) lookup on Rx. + */ + +enum { + LIBETH_RX_PT_OUTER_L2 = 0U, + LIBETH_RX_PT_OUTER_IPV4, + LIBETH_RX_PT_OUTER_IPV6, +}; + +enum { + LIBETH_RX_PT_NOT_FRAG = 0U, + LIBETH_RX_PT_FRAG, +}; + +enum { + LIBETH_RX_PT_TUNNEL_IP_NONE = 0U, + LIBETH_RX_PT_TUNNEL_IP_IP, + LIBETH_RX_PT_TUNNEL_IP_GRENAT, + LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC, + LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC_VLAN, +}; + +enum { + LIBETH_RX_PT_TUNNEL_END_NONE = 0U, + LIBETH_RX_PT_TUNNEL_END_IPV4, + LIBETH_RX_PT_TUNNEL_END_IPV6, +}; + +enum { + LIBETH_RX_PT_INNER_NONE = 0U, + LIBETH_RX_PT_INNER_UDP, + LIBETH_RX_PT_INNER_TCP, + LIBETH_RX_PT_INNER_SCTP, + LIBETH_RX_PT_INNER_ICMP, + LIBETH_RX_PT_INNER_TIMESYNC, +}; + +#define LIBETH_RX_PT_PAYLOAD_NONE PKT_HASH_TYPE_NONE +#define LIBETH_RX_PT_PAYLOAD_L2 PKT_HASH_TYPE_L2 +#define LIBETH_RX_PT_PAYLOAD_L3 PKT_HASH_TYPE_L3 +#define LIBETH_RX_PT_PAYLOAD_L4 PKT_HASH_TYPE_L4 + +struct libeth_rx_pt { + u32 outer_ip:2; + u32 outer_frag:1; + u32 tunnel_type:3; + u32 tunnel_end_prot:2; + u32 tunnel_end_frag:1; + u32 inner_prot:3; + enum pkt_hash_types payload_layer:2; + + u32 pad:2; + enum xdp_rss_hash_type hash_type:16; +}; + +void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt); + +/** + * libeth_rx_pt_get_ip_ver - get IP version from a packet type structure + * @pt: packet type params + * + * Wrapper to compile out the IPv6 code from the drivers when not supported + * by the kernel. + * + * Return: @pt.outer_ip or stub for IPv6 when not compiled-in. + */ +static inline u32 libeth_rx_pt_get_ip_ver(struct libeth_rx_pt pt) +{ +#if !IS_ENABLED(CONFIG_IPV6) + switch (pt.outer_ip) { + case LIBETH_RX_PT_OUTER_IPV4: + return LIBETH_RX_PT_OUTER_IPV4; + default: + return LIBETH_RX_PT_OUTER_L2; + } +#else + return pt.outer_ip; +#endif +} + +/* libeth_has_*() can be used to quickly check whether the HW metadata is + * available to avoid further expensive processing such as descriptor reads. + * They already check for the corresponding netdev feature to be enabled, + * thus can be used as drop-in replacements. + */ + +static inline bool libeth_rx_pt_has_checksum(const struct net_device *dev, + struct libeth_rx_pt pt) +{ + /* Non-zero _INNER* is only possible when _OUTER_IPV* is set, + * it is enough to check only for the L4 type. + */ + return likely(pt.inner_prot > LIBETH_RX_PT_INNER_NONE && + (dev->features & NETIF_F_RXCSUM)); +} + +static inline bool libeth_rx_pt_has_hash(const struct net_device *dev, + struct libeth_rx_pt pt) +{ + return likely(pt.payload_layer > LIBETH_RX_PT_PAYLOAD_NONE && + (dev->features & NETIF_F_RXHASH)); +} + +/** + * libeth_rx_pt_set_hash - fill in skb hash value basing on the PT + * @skb: skb to fill the hash in + * @hash: 32-bit hash value from the descriptor + * @pt: packet type + */ +static inline void libeth_rx_pt_set_hash(struct sk_buff *skb, u32 hash, + struct libeth_rx_pt pt) +{ + skb_set_hash(skb, hash, pt.payload_layer); +} + +#endif /* __LIBETH_RX_H */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index baaff7bc09..45ad37adbe 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -361,7 +361,7 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response * status changed. * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed. - * @BSS_CHANGED_MLD_TTLM: TID to link mapping was changed + * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -1924,10 +1924,12 @@ enum ieee80211_neg_ttlm_res { * @active_links: The bitmap of active links, or 0 for non-MLO. * The driver shouldn't change this directly, but use the * API calls meant for that purpose. - * @dormant_links: bitmap of valid but disabled links, or 0 for non-MLO. - * Must be a subset of valid_links. + * @dormant_links: subset of the valid links that are disabled/suspended + * due to advertised or negotiated TTLM respectively. + * 0 for non-MLO. * @suspended_links: subset of dormant_links representing links that are - * suspended. + * suspended due to negotiated TTLM, and could be activated in the + * future by tearing down the TTLM negotiation. * 0 for non-MLO. * @neg_ttlm: negotiated TID to link mapping info. * see &struct ieee80211_neg_ttlm. @@ -2052,7 +2054,7 @@ static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that get a wdev. * - * Note that this function may return %NULL if the given wdev isn't + * Return: pointer to the wdev, or %NULL if the given wdev isn't * associated with a vif that the driver knows about (e.g. monitor * or AP_VLAN interfaces.) */ @@ -2065,6 +2067,8 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that needs to get the wdev for a vif. * This can also be useful to get the netdev associated to a vif. + * + * Return: pointer to the wdev */ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); @@ -2123,8 +2127,8 @@ static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif) * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver - * for a AES_CMAC key to indicate that it requires sequence number - * generation only + * for a AES_CMAC or a AES_GMAC key to indicate that it requires sequence + * number generation only * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key * (set by mac80211 from the sta->spp_amsdu flag) */ @@ -2780,6 +2784,8 @@ struct ieee80211_txq { * * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT * and connecting with a lower bandwidth instead + * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in + * EHT in 5 GHz and connecting with a lower bandwidth instead * * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so * no need to stop queues. This really should be set by a driver that @@ -2844,6 +2850,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, IEEE80211_HW_DISALLOW_PUNCTURING, + IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, IEEE80211_HW_HANDLES_QUIET_CSA, /* keep last, obviously */ @@ -5600,7 +5607,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id); * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @link_id: valid link_id during MLO or 0 for non-MLO * - * This function returns whether the countdown reached zero. + * Return: %true if the countdown reached 1, %false otherwise */ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, unsigned int link_id); @@ -5608,12 +5615,13 @@ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif, /** * ieee80211_color_change_finish - notify mac80211 about color change * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO * * After a color change announcement was scheduled and the counter in this * announcement hits 1, this function must be called by the driver to * notify mac80211 that the color can be changed */ -void ieee80211_color_change_finish(struct ieee80211_vif *vif); +void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id); /** * ieee80211_proberesp_get - retrieve a Probe Response template @@ -5945,8 +5953,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * key(s) will be available. These will be needed by mac80211 for proper * RX processing, so this function allows setting them. * - * The function returns the newly allocated key structure, which will - * have similar contents to the passed key configuration but point to + * Return: the newly allocated key structure, which will have + * similar contents to the passed key configuration but point to * mac80211-owned memory. In case of errors, the function returns an * ERR_PTR(), use IS_ERR() etc. * @@ -6345,6 +6353,8 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, * may be %NULL if the link ID is not needed * * Obtain the STA by link address, must use RCU protection. + * + * Return: pointer to STA if found, otherwise %NULL. */ struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, @@ -6474,8 +6484,8 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, * @hw: pointer obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface * - * Return true if the AQL's airtime limit has not been reached and the txq can - * continue to send more packets to the device. Otherwise return false. + * Return: %true if the AQL's airtime limit has not been reached and the txq can + * continue to send more packets to the device. Otherwise return %false. */ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); @@ -6978,6 +6988,8 @@ bool rate_usable_index_exists(struct ieee80211_supported_band *sband, * @hw: pointer as obtained from ieee80211_alloc_hw() * @pubsta: &struct ieee80211_sta pointer to the target destination. * @rates: new tx rate set to be used for this station. + * + * Return: 0 on success. An error code otherwise. */ int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, @@ -7138,6 +7150,8 @@ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * + * Return: %true if the skb was prepared, %false otherwise + * * Note: must be called under RCU lock */ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, @@ -7154,6 +7168,8 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, * * @skb: packet injected by userspace * @dev: the &struct device of this 802.11 device + * + * Return: %true if the radiotap header was parsed, %false otherwise */ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, struct net_device *dev); @@ -7263,7 +7279,7 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() * - * Returns the skb if successful, %NULL if no frame was available. + * Return: the skb if successful, %NULL if no frame was available. * * Note that this must be called in an rcu_read_lock() critical section, * which can only be released after the SKB was handled. Some pointers in @@ -7289,6 +7305,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface, or from * ieee80211_next_txq() + * + * Return: the skb if successful, %NULL if no frame was available. */ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, struct ieee80211_txq *txq) @@ -7320,7 +7338,7 @@ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @ac: AC number to return packets from. * - * Returns the next txq if successful, %NULL if no queue is eligible. If a txq + * Return: the next txq if successful, %NULL if no queue is eligible. If a txq * is returned, it should be returned with ieee80211_return_txq() after the * driver has finished scheduling it. */ @@ -7403,6 +7421,8 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * * @hw: pointer as obtained from ieee80211_alloc_hw() * @txq: pointer obtained from station or virtual interface + * + * Return: %true if transmission is allowed, %false otherwise */ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq); @@ -7463,6 +7483,8 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, * @status: &struct ieee80211_rx_status containing the transmission rate * information. * @len: frame length in bytes + * + * Return: the airtime estimate */ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, struct ieee80211_rx_status *status, @@ -7477,23 +7499,13 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, * @hw: pointer as obtained from ieee80211_alloc_hw() * @info: &struct ieee80211_tx_info of the frame. * @len: frame length in bytes + * + * Return: the airtime estimate */ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int len); /** - * ieee80211_set_hw_80211_encap - enable hardware encapsulation offloading. - * - * This function is used to notify mac80211 that a vif can be passed raw 802.3 - * frames. The driver needs to then handle the 802.11 encapsulation inside the - * hardware or firmware. - * - * @vif: &struct ieee80211_vif pointer from the add_interface callback. - * @enable: indicate if the feature should be turned on or off - */ -bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable); - -/** * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. @@ -7522,6 +7534,7 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, /** * ieee80211_obss_color_collision_notify - notify userland about a BSS color * collision. + * @link_id: valid link_id during MLO or 0 for non-MLO * * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is @@ -7529,7 +7542,7 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, */ void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, - u64 color_bitmap); + u64 color_bitmap, u8 link_id); /** * ieee80211_is_tx_data - check if frame is a data frame @@ -7538,6 +7551,8 @@ ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, * hardware encapsulation enabled are data frames. * * @skb: the frame to be transmitted. + * + * Return: %true if @skb is a data frame, %false otherwise */ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) { @@ -7573,6 +7588,8 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * - change_sta_links(0x10) for each affected STA (the AP) * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) + * + * Return: 0 on success. An error code otherwise. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); @@ -7589,6 +7606,15 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links); +/** + * ieee80211_send_teardown_neg_ttlm - tear down a negotiated TTLM request + * @vif: the interface on which the tear down request should be sent. + * + * This function can be used to tear down a previously accepted negotiated + * TTLM request. + */ +void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); + /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 4eeedf1471..f207a6e104 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -670,6 +670,7 @@ struct mana_cfg_rx_steer_req_v2 { u8 hashkey[MANA_HASH_KEY_SIZE]; u8 cqe_coalescing_enable; u8 reserved2[7]; + mana_handle_t indir_tab[] __counted_by(num_indir_entries); }; /* HW DATA */ struct mana_cfg_rx_steer_resp { @@ -795,4 +796,6 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, u32 doorbell_pg_id); void mana_uncfg_vport(struct mana_port_context *apc); + +struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index); #endif /* _MANA_H */ diff --git a/include/net/mptcp.h b/include/net/mptcp.h index fb996124b3..0bc4ab03f4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -97,6 +97,9 @@ struct mptcp_out_options { }; #define MPTCP_SCHED_NAME_MAX 16 +#define MPTCP_SCHED_MAX 128 +#define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) + #define MPTCP_SUBFLOWS_MAX 8 struct mptcp_sched_data { diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 1ec4085853..a8a7e48dfa 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -9,11 +9,41 @@ struct netdev_queue_stats_rx { u64 bytes; u64 packets; u64 alloc_fail; + + u64 hw_drops; + u64 hw_drop_overruns; + + u64 csum_unnecessary; + u64 csum_none; + u64 csum_bad; + + u64 hw_gro_packets; + u64 hw_gro_bytes; + u64 hw_gro_wire_packets; + u64 hw_gro_wire_bytes; + + u64 hw_drop_ratelimits; }; struct netdev_queue_stats_tx { u64 bytes; u64 packets; + + u64 hw_drops; + u64 hw_drop_errors; + + u64 csum_none; + u64 needs_csum; + + u64 hw_gso_packets; + u64 hw_gso_bytes; + u64 hw_gso_wire_packets; + u64 hw_gso_wire_bytes; + + u64 hw_drop_ratelimits; + + u64 stop; + u64 wake; }; /** @@ -61,6 +91,37 @@ struct netdev_stat_ops { }; /** + * struct netdev_queue_mgmt_ops - netdev ops for queue management + * + * @ndo_queue_mem_size: Size of the struct that describes a queue's memory. + * + * @ndo_queue_mem_alloc: Allocate memory for an RX queue at the specified index. + * The new memory is written at the specified address. + * + * @ndo_queue_mem_free: Free memory from an RX queue. + * + * @ndo_queue_start: Start an RX queue with the specified memory and at the + * specified index. + * + * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped + * queue's memory is written at the specified address. + */ +struct netdev_queue_mgmt_ops { + size_t ndo_queue_mem_size; + int (*ndo_queue_mem_alloc)(struct net_device *dev, + void *per_queue_mem, + int idx); + void (*ndo_queue_mem_free)(struct net_device *dev, + void *per_queue_mem); + int (*ndo_queue_start)(struct net_device *dev, + void *per_queue_mem, + int idx); + int (*ndo_queue_stop)(struct net_device *dev, + void *per_queue_mem, + int idx); +}; + +/** * DOC: Lockless queue stopping / waking helpers. * * The netif_txq_maybe_stop() and __netif_txq_completed_wake() diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2164fa350f..188d41da1a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -416,7 +416,7 @@ struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); @@ -940,7 +940,7 @@ struct nft_expr_ops { struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, - const struct nft_expr *src); + const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, diff --git a/include/net/netlabel.h b/include/net/netlabel.h index f3ab0b8a4b..654bc777d2 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -274,15 +274,17 @@ struct netlbl_calipso_ops { * on success, NULL on failure. * */ -static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags) +static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags) { struct netlbl_lsm_cache *cache; - cache = kzalloc(sizeof(*cache), flags); + cache = kzalloc_noprof(sizeof(*cache), flags); if (cache) refcount_set(&cache->refcount, 1); return cache; } +#define netlbl_secattr_cache_alloc(...) \ + alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct @@ -311,10 +313,11 @@ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache) * on failure. * */ -static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags) +static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_catmap), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_catmap), flags); } +#define netlbl_catmap_alloc(...) alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__)) /** * netlbl_catmap_free - Free a LSM secattr catmap @@ -376,10 +379,11 @@ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) * pointer on success, or NULL on failure. * */ -static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags) +static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags) { - return kzalloc(sizeof(struct netlbl_lsm_secattr), flags); + return kzalloc_noprof(sizeof(struct netlbl_lsm_secattr), flags); } +#define netlbl_secattr_alloc(...) alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__)) /** * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct @@ -470,7 +474,8 @@ void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state); int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr); + const struct netlbl_lsm_secattr *secattr, + bool sk_locked); void netlbl_sock_delattr(struct sock *sk); int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); @@ -487,6 +492,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway); +bool netlbl_sk_lock_check(struct sock *sk); /* * LSM label mapping cache operations @@ -614,7 +620,8 @@ static inline int netlbl_enabled(void) } static inline int netlbl_sock_setattr(struct sock *sk, u16 family, - const struct netlbl_lsm_secattr *secattr) + const struct netlbl_lsm_secattr *secattr, + bool sk_locked) { return -ENOSYS; } @@ -673,6 +680,11 @@ static inline struct audit_buffer *netlbl_audit_start(int type, { return NULL; } + +static inline bool netlbl_sk_lock_check(struct sock *sk) +{ + return true; +} #endif /* CONFIG_NETLABEL */ const struct netlbl_calipso_ops * diff --git a/include/net/netlink.h b/include/net/netlink.h index c19ff921b6..e78ce008e0 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -41,7 +41,8 @@ * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction - * nlmsg_free() free a netlink message + * nlmsg_consume() free a netlink message (expected) + * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups @@ -157,7 +158,11 @@ * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes + * nla_for_each_attr_type() loop over all attributes with the + * given type * nla_for_each_nested() loop over the nested attributes + * nla_for_each_nested_type() loop over the nested attributes with + * the given type *========================================================================= */ @@ -1078,7 +1083,7 @@ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) } /** - * nlmsg_free - free a netlink message + * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) @@ -1087,6 +1092,15 @@ static inline void nlmsg_free(struct sk_buff *skb) } /** + * nlmsg_consume - free a netlink message + * @skb: socket buffer of netlink message + */ +static inline void nlmsg_consume(struct sk_buff *skb) +{ + consume_skb(skb); +} + +/** * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer @@ -1891,10 +1905,11 @@ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ -static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) +static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { - return kmemdup(nla_data(src), nla_len(src), gfp); + return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } +#define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes @@ -2071,6 +2086,18 @@ static inline int nla_total_size_64bit(int payload) pos = nla_next(pos, &(rem))) /** + * nla_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @head: head of attribute stream + * @len: length of attribute stream + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_attr_type(pos, type, head, len, rem) \ + nla_for_each_attr(pos, head, len, rem) \ + if (nla_type(pos) == type) + +/** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes @@ -2080,6 +2107,17 @@ static inline int nla_total_size_64bit(int payload) nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** + * nla_for_each_nested_type - iterate over nested attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @nla: attribute containing the nested attributes + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_nested_type(pos, type, nla, rem) \ + nla_for_each_nested(pos, nla, rem) \ + if (nla_type(pos) == type) + +/** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 7ca315ad50..68463aebcc 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -267,7 +267,7 @@ static inline bool nexthop_get(struct nexthop *nh) static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) - call_rcu(&nh->rcu, nexthop_free_rcu); + call_rcu_hurry(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 1d397c1a00..873631c79a 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -52,13 +52,15 @@ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H +#include <linux/dma-mapping.h> + #include <net/page_pool/types.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); @@ -73,7 +75,7 @@ static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) return data; } -static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } @@ -204,8 +206,8 @@ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) +static inline enum dma_data_direction +page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } @@ -370,7 +372,7 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va, * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { dma_addr_t ret = page->dma_addr; @@ -395,6 +397,28 @@ static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) return false; } +/** + * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW + * @pool: &page_pool the @page belongs to + * @page: page to sync + * @offset: offset from page start to "hard" start if using PP frags + * @dma_sync_size: size of the data written to the page + * + * Can be used as a shorthand to sync Rx pages before accessing them in the + * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. + * Note that this version performs DMA sync unconditionally, even if the + * associated PP doesn't perform sync-for-device. + */ +static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const struct page *page, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, + page_pool_get_dma_addr(page), + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); +} + static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 5e43a08d32..7e8477057f 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -45,20 +45,21 @@ struct pp_alloc_cache { /** * struct page_pool_params - page pool parameters - * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV + * @fast: params accessed frequently on hotpath * @order: 2^order pages on allocation * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes - * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV + * @slow: params with slowpath access only (initialization and Netlink) + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL */ struct page_pool_params { struct_group_tagged(page_pool_params_fast, fast, - unsigned int flags; unsigned int order; unsigned int pool_size; int nid; @@ -70,6 +71,7 @@ struct page_pool_params { ); struct_group_tagged(page_pool_params_slow, slow, struct net_device *netdev; + unsigned int flags; /* private: used by test code only */ void (*init_callback)(struct page *page, void *arg); void *init_arg; @@ -130,12 +132,28 @@ struct page_pool { struct page_pool_params_fast p; int cpuid; - bool has_init_callback; + u32 pages_state_hold_cnt; + bool has_init_callback:1; /* slow::init_callback is set */ + bool dma_map:1; /* Perform DMA mapping */ + bool dma_sync:1; /* Perform DMA sync */ +#ifdef CONFIG_PAGE_POOL_STATS + bool system:1; /* This is a global percpu pool */ +#endif + + /* The following block must stay within one cacheline. On 32-bit + * systems, sizeof(long) == sizeof(int), so that the block size is + * ``3 * sizeof(long)``. On 64-bit systems, the actual size is + * ``2 * sizeof(long) + sizeof(int)``. The closest pow-2 to both of + * them is ``4 * sizeof(long)``, so just use that one for simplicity. + * Having it aligned to a cacheline boundary may be excessive and + * doesn't bring any good. + */ + __cacheline_group_begin(frag) __aligned(4 * sizeof(long)); long frag_users; struct page *frag_page; unsigned int frag_offset; - u32 pages_state_hold_cnt; + __cacheline_group_end(frag); struct delayed_work release_dw; void (*disconnect)(void *pool); @@ -213,7 +231,7 @@ struct xdp_mem_info; #ifdef CONFIG_PAGE_POOL void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem); + const struct xdp_mem_info *mem); void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count); #else @@ -223,7 +241,7 @@ static inline void page_pool_destroy(struct page_pool *pool) static inline void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { } diff --git a/include/net/pfcp.h b/include/net/pfcp.h new file mode 100644 index 0000000000..af14f970b8 --- /dev/null +++ b/include/net/pfcp.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PFCP_H_ +#define _PFCP_H_ + +#include <uapi/linux/if_ether.h> +#include <net/dst_metadata.h> +#include <linux/netdevice.h> +#include <uapi/linux/ipv6.h> +#include <net/udp_tunnel.h> +#include <uapi/linux/udp.h> +#include <uapi/linux/ip.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bits.h> + +#define PFCP_PORT 8805 + +/* PFCP protocol header */ +struct pfcphdr { + u8 flags; + u8 message_type; + __be16 message_length; +}; + +/* PFCP header flags */ +#define PFCP_SEID_FLAG BIT(0) +#define PFCP_MP_FLAG BIT(1) + +#define PFCP_VERSION_MASK GENMASK(4, 0) + +#define PFCP_HLEN (sizeof(struct udphdr) + sizeof(struct pfcphdr)) + +/* PFCP node related messages */ +struct pfcphdr_node { + u8 seq_number[3]; + u8 reserved; +}; + +/* PFCP session related messages */ +struct pfcphdr_session { + __be64 seid; + u8 seq_number[3]; +#ifdef __LITTLE_ENDIAN_BITFIELD + u8 message_priority:4, + reserved:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + message_priprity:4; +#else +#error "Please fix <asm/byteorder>" +#endif +}; + +struct pfcp_metadata { + u8 type; + __be64 seid; +} __packed; + +enum { + PFCP_TYPE_NODE = 0, + PFCP_TYPE_SESSION = 1, +}; + +#define PFCP_HEADROOM (sizeof(struct iphdr) + sizeof(struct udphdr) + \ + sizeof(struct pfcphdr) + sizeof(struct ethhdr)) +#define PFCP6_HEADROOM (sizeof(struct ipv6hdr) + sizeof(struct udphdr) + \ + sizeof(struct pfcphdr) + sizeof(struct ethhdr)) + +static inline struct pfcphdr *pfcp_hdr(struct sk_buff *skb) +{ + return (struct pfcphdr *)(udp_hdr(skb) + 1); +} + +static inline struct pfcphdr_node *pfcp_hdr_node(struct sk_buff *skb) +{ + return (struct pfcphdr_node *)(pfcp_hdr(skb) + 1); +} + +static inline struct pfcphdr_session *pfcp_hdr_session(struct sk_buff *skb) +{ + return (struct pfcphdr_session *)(pfcp_hdr(skb) + 1); +} + +static inline bool netif_is_pfcp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "pfcp"); +} + +#endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a4ee43f493..41297bd38d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -74,6 +74,15 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block) return block && block->index; } +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); + +static inline bool tcf_block_bypass_sw(struct tcf_block *block) +{ + return block && block->bypass_wanted; +} +#endif + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h new file mode 100644 index 0000000000..a6ab2f4f5e --- /dev/null +++ b/include/net/proto_memory.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PROTO_MEMORY_H +#define _PROTO_MEMORY_H + +#include <net/sock.h> +#include <net/hotdata.h> + +/* 1 MB per cpu, in page units */ +#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) + +static inline bool sk_has_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure != NULL; +} + +static inline bool +proto_memory_pressure(const struct proto *prot) +{ + if (!prot->memory_pressure) + return false; + return !!READ_ONCE(*prot->memory_pressure); +} + +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return proto_memory_pressure(sk->sk_prot); +} + +static inline bool sk_under_memory_pressure(const struct sock *sk) +{ + if (!sk->sk_prot->memory_pressure) + return false; + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + mem_cgroup_under_socket_pressure(sk->sk_memcg)) + return true; + + return !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + +static inline long +proto_memory_allocated(const struct proto *prot) +{ + return max(0L, atomic_long_read(prot->memory_allocated)); +} + +static inline long +sk_memory_allocated(const struct sock *sk) +{ + return proto_memory_allocated(sk->sk_prot); +} + +static inline void proto_memory_pcpu_drain(struct proto *proto) +{ + int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); + + if (val) + atomic_long_add(val, proto->memory_allocated); +} + +static inline void +sk_memory_allocated_add(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +static inline void +sk_memory_allocated_sub(const struct sock *sk, int val) +{ + struct proto *proto = sk->sk_prot; + + val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); + + if (unlikely(val <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv))) + proto_memory_pcpu_drain(proto); +} + +#endif /* _PROTO_MEMORY_H */ diff --git a/include/net/red.h b/include/net/red.h index 425364de0d..802287d52c 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -233,10 +233,10 @@ static inline void red_set_parms(struct red_parms *p, int delta = qth_max - qth_min; u32 max_p_delta; - p->qth_min = qth_min << Wlog; - p->qth_max = qth_max << Wlog; - p->Wlog = Wlog; - p->Plog = Plog; + WRITE_ONCE(p->qth_min, qth_min << Wlog); + WRITE_ONCE(p->qth_max, qth_max << Wlog); + WRITE_ONCE(p->Wlog, Wlog); + WRITE_ONCE(p->Plog, Plog); if (delta <= 0) delta = 1; p->qth_delta = delta; @@ -244,7 +244,7 @@ static inline void red_set_parms(struct red_parms *p, max_P = red_maxp(Plog); max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ } - p->max_P = max_P; + WRITE_ONCE(p->max_P, max_P); max_p_delta = max_P / delta; max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); @@ -257,7 +257,7 @@ static inline void red_set_parms(struct red_parms *p, p->target_min = qth_min + 2*delta; p->target_max = qth_min + 3*delta; - p->Scell_log = Scell_log; + WRITE_ONCE(p->Scell_log, Scell_log); p->Scell_max = (255 << Scell_log); if (stab) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 004e651e60..ebcb8896bf 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -18,6 +18,7 @@ #include <linux/refcount.h> #include <net/sock.h> +#include <net/rstreason.h> struct request_sock; struct sk_buff; @@ -34,7 +35,8 @@ struct request_sock_ops { void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, - struct sk_buff *skb); + struct sk_buff *skb, + enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); void (*syn_ack_timeout)(const struct request_sock *req); }; @@ -127,12 +129,12 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb, } static inline struct request_sock * -reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, +reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { struct request_sock *req; - req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); + req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!req) return NULL; req->rsk_listener = NULL; @@ -157,6 +159,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, return req; } +#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__)) static inline void __reqsk_free(struct request_sock *req) { @@ -282,4 +285,16 @@ static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) return atomic_read(&queue->young); } +/* RFC 7323 2.3 Using the Window Scale Option + * The window field (SEG.WND) of every outgoing segment, with the + * exception of <SYN> segments, MUST be right-shifted by + * Rcv.Wind.Shift bits. + * + * This means the SEG.WND carried in SYNACK can not exceed 65535. + * We use this property to harden TCP stack while in NEW_SYN_RECV state. + */ +static inline u32 tcp_synack_window(const struct request_sock *req) +{ + return min(req->rsk_rcv_wnd, 65535U); +} #endif /* _REQUEST_SOCK_H */ diff --git a/include/net/route.h b/include/net/route.h index af55401aa8..93833cfe9c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -35,8 +35,6 @@ #include <linux/cache.h> #include <linux/security.h> -#define RTO_ONLINK 0x01 - static inline __u8 ip_sock_rt_scope(const struct sock *sk) { if (sock_flag(sk, SOCK_LOCALROUTE)) @@ -152,15 +150,22 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 return ip_route_output_flow(net, flp, NULL); } +/* Simplistic IPv4 route lookup function. + * This is only suitable for some particular use cases: since the flowi4 + * structure is only partially set, it may bypass some fib-rules. + */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, - __be32 saddr, u8 tos, int oif) + __be32 saddr, u8 tos, int oif, + __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, .flowi4_tos = tos, + .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, }; + return ip_route_output_key(net, &fl4); } diff --git a/include/net/rps.h b/include/net/rps.h index 7660243e90..a93401d23d 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -122,4 +122,32 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } +static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return ++sd->input_queue_tail; +#else + return 0; +#endif +} + +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(*dest, tail); +#endif +} + +static inline void rps_input_queue_head_add(struct softnet_data *sd, int val) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val); +#endif +} + +static inline void rps_input_queue_head_incr(struct softnet_data *sd) +{ + rps_input_queue_head_add(sd, 1); +} + #endif /* _NET_RPS_H */ diff --git a/include/net/rstreason.h b/include/net/rstreason.h new file mode 100644 index 0000000000..2575c85d7f --- /dev/null +++ b/include/net/rstreason.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_RSTREASON_H +#define _LINUX_RSTREASON_H +#include <net/dropreason-core.h> +#include <uapi/linux/mptcp.h> + +#define DEFINE_RST_REASON(FN, FNe) \ + FN(NOT_SPECIFIED) \ + FN(NO_SOCKET) \ + FN(TCP_INVALID_ACK_SEQUENCE) \ + FN(TCP_RFC7323_PAWS) \ + FN(TCP_TOO_OLD_ACK) \ + FN(TCP_ACK_UNSENT_DATA) \ + FN(TCP_FLAGS) \ + FN(TCP_OLD_ACK) \ + FN(TCP_ABORT_ON_DATA) \ + FN(TCP_TIMEWAIT_SOCKET) \ + FN(INVALID_SYN) \ + FN(MPTCP_RST_EUNSPEC) \ + FN(MPTCP_RST_EMPTCP) \ + FN(MPTCP_RST_ERESOURCE) \ + FN(MPTCP_RST_EPROHIBIT) \ + FN(MPTCP_RST_EWQ2BIG) \ + FN(MPTCP_RST_EBADPERF) \ + FN(MPTCP_RST_EMIDDLEBOX) \ + FN(ERROR) \ + FNe(MAX) + +/** + * enum sk_rst_reason - the reasons of socket reset + * + * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * + * There are three parts in order: + * 1) skb drop reasons: relying on drop reasons for such as passive reset + * 2) independent reset reasons: such as active reset reasons + * 3) reset reasons in MPTCP: only for MPTCP use + */ +enum sk_rst_reason { + /* Refer to include/net/dropreason-core.h + * Rely on skb drop reasons because it indicates exactly why RST + * could happen. + */ + /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */ + SK_RST_REASON_NOT_SPECIFIED, + /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */ + SK_RST_REASON_NO_SOCKET, + /** + * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ + * field because ack sequence is not in the window between snd_una + * and snd_nxt + */ + SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE, + /** + * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to + * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED + */ + SK_RST_REASON_TCP_RFC7323_PAWS, + /** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */ + SK_RST_REASON_TCP_TOO_OLD_ACK, + /** + * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't + * sent yet + */ + SK_RST_REASON_TCP_ACK_UNSENT_DATA, + /** @SK_RST_REASON_TCP_FLAGS: TCP flags invalid */ + SK_RST_REASON_TCP_FLAGS, + /** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */ + SK_RST_REASON_TCP_OLD_ACK, + /** + * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data + * corresponding to LINUX_MIB_TCPABORTONDATA + */ + SK_RST_REASON_TCP_ABORT_ON_DATA, + + /* Here start with the independent reasons */ + /** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */ + SK_RST_REASON_TCP_TIMEWAIT_SOCKET, + /** + * @SK_RST_REASON_INVALID_SYN: receive bad syn packet + * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then + * "fourth, check the SYN bit,...If the SYN is in the window it is + * an error, send a reset" + */ + SK_RST_REASON_INVALID_SYN, + + /* Copy from include/uapi/linux/mptcp.h. + * These reset fields will not be changed since they adhere to + * RFC 8684. So do not touch them. I'm going to list each definition + * of them respectively. + */ + /** + * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error. + * This is the default error; it implies that the subflow is no + * longer available. The presence of this option shows that the + * RST was generated by an MPTCP-aware device. + */ + SK_RST_REASON_MPTCP_RST_EUNSPEC, + /** + * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error. + * An error has been detected in the processing of MPTCP options. + * This is the usual reason code to return in the cases where a RST + * is being sent to close a subflow because of an invalid response. + */ + SK_RST_REASON_MPTCP_RST_EMPTCP, + /** + * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources. + * This code indicates that the sending host does not have enough + * resources to support the terminated subflow. + */ + SK_RST_REASON_MPTCP_RST_ERESOURCE, + /** + * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited. + * This code indicates that the requested subflow is prohibited by + * the policies of the sending host. + */ + SK_RST_REASON_MPTCP_RST_EPROHIBIT, + /** + * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data. + * This code indicates that there is an excessive amount of data + * that needs to be transmitted over the terminated subflow while + * having already been acknowledged over one or more other subflows. + * This may occur if a path has been unavailable for a short period + * and it is more efficient to reset and start again than it is to + * retransmit the queued data. + */ + SK_RST_REASON_MPTCP_RST_EWQ2BIG, + /** + * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance. + * This code indicates that the performance of this subflow was + * too low compared to the other subflows of this Multipath TCP + * connection. + */ + SK_RST_REASON_MPTCP_RST_EBADPERF, + /** + * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference. + * Middlebox interference has been detected over this subflow, + * making MPTCP signaling invalid. For example, this may be sent + * if the checksum does not validate. + */ + SK_RST_REASON_MPTCP_RST_EMIDDLEBOX, + + /** @SK_RST_REASON_ERROR: unexpected error happens */ + SK_RST_REASON_ERROR, + + /** + * @SK_RST_REASON_MAX: Maximum of socket reset reasons. + * It shouldn't be used as a real 'reason'. + */ + SK_RST_REASON_MAX, +}; + +/* Convert skb drop reasons to enum sk_rst_reason type */ +static inline enum sk_rst_reason +sk_rst_convert_drop_reason(enum skb_drop_reason reason) +{ + switch (reason) { + case SKB_DROP_REASON_NOT_SPECIFIED: + return SK_RST_REASON_NOT_SPECIFIED; + case SKB_DROP_REASON_NO_SOCKET: + return SK_RST_REASON_NO_SOCKET; + case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: + return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE; + case SKB_DROP_REASON_TCP_RFC7323_PAWS: + return SK_RST_REASON_TCP_RFC7323_PAWS; + case SKB_DROP_REASON_TCP_TOO_OLD_ACK: + return SK_RST_REASON_TCP_TOO_OLD_ACK; + case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: + return SK_RST_REASON_TCP_ACK_UNSENT_DATA; + case SKB_DROP_REASON_TCP_FLAGS: + return SK_RST_REASON_TCP_FLAGS; + case SKB_DROP_REASON_TCP_OLD_ACK: + return SK_RST_REASON_TCP_OLD_ACK; + case SKB_DROP_REASON_TCP_ABORT_ON_DATA: + return SK_RST_REASON_TCP_ABORT_ON_DATA; + default: + /* If we don't have our own corresponding reason */ + return SK_RST_REASON_NOT_SPECIFIED; + } +} +#endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0014b9ee5e..79edd5b5e3 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -424,6 +424,7 @@ struct tcf_proto { */ spinlock_t lock; bool deleting; + bool counted; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; @@ -473,6 +474,9 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; + bool bypass_wanted; + atomic_t filtercnt; /* Number of filters */ + atomic_t skipswcnt; /* Number of skip_sw filters */ atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ diff --git a/include/net/scm.h b/include/net/scm.h index 92276a2c55..0d35c7c77a 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -23,10 +23,20 @@ struct scm_creds { kgid_t gid; }; +#ifdef CONFIG_UNIX +struct unix_edge; +#endif + struct scm_fp_list { short count; short count_unix; short max; +#ifdef CONFIG_UNIX + bool inflight; + bool dead; + struct list_head vertices; + struct unix_edge *edges; +#endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; diff --git a/include/net/smc.h b/include/net/smc.h index c9dcb30e3f..db84e4e350 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -26,9 +26,6 @@ struct smc_hashinfo { struct hlist_head ht; }; -int smc_hash_sk(struct sock *sk); -void smc_unhash_sk(struct sock *sk); - /* SMCD/ISM device driver interface */ struct smcd_dmb { u64 dmb_tok; @@ -50,7 +47,6 @@ struct smcd_dmb { #define ISM_ERROR 0xFFFF struct smcd_dev; -struct ism_client; struct smcd_gid { u64 gid; @@ -61,14 +57,8 @@ struct smcd_ops { int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - struct ism_client *client); + void *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); - int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info); int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); @@ -76,11 +66,23 @@ struct smcd_ops { void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); + + /* optional operations */ + int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*set_vlan_required)(struct smcd_dev *dev); + int (*reset_vlan_required)(struct smcd_dev *dev); + int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 trigger_irq, u32 event_code, u64 info); + int (*support_dmb_nocopy)(struct smcd_dev *dev); + int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*detach_dmb)(struct smcd_dev *dev, u64 token); }; struct smcd_dev { const struct smcd_ops *ops; void *priv; + void *client; struct list_head list; spinlock_t lock; struct smc_connection **conn; diff --git a/include/net/sock.h b/include/net/sock.h index 944f71a8ab..953c8dc4e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1194,6 +1194,13 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) size - offsetof(struct sock, sk_node.pprev)); } +struct proto_accept_arg { + int flags; + int err; + int is_empty; + bool kern; +}; + /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ @@ -1208,8 +1215,8 @@ struct proto { int addr_len); int (*disconnect)(struct sock *sk, int flags); - struct sock * (*accept)(struct sock *sk, int flags, int *err, - bool kern); + struct sock * (*accept)(struct sock *sk, + struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); @@ -1371,75 +1378,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk, #endif } -static inline bool sk_has_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure != NULL; -} - -static inline bool sk_under_global_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure && - !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline bool sk_under_memory_pressure(const struct sock *sk) -{ - if (!sk->sk_prot->memory_pressure) - return false; - - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) - return true; - - return !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline long -proto_memory_allocated(const struct proto *prot) -{ - return max(0L, atomic_long_read(prot->memory_allocated)); -} - -static inline long -sk_memory_allocated(const struct sock *sk) -{ - return proto_memory_allocated(sk->sk_prot); -} - -/* 1 MB per cpu, in page units */ -#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) -extern int sysctl_mem_pcpu_rsv; - -static inline void proto_memory_pcpu_drain(struct proto *proto) -{ - int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); - - if (val) - atomic_long_add(val, proto->memory_allocated); -} - -static inline void -sk_memory_allocated_add(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - -static inline void -sk_memory_allocated_sub(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) @@ -1466,15 +1404,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline bool -proto_memory_pressure(struct proto *prot) -{ - if (!prot->memory_pressure) - return false; - return !!READ_ONCE(*prot->memory_pressure); -} - - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { @@ -1882,7 +1811,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); -int sock_no_accept(struct socket *, struct socket *, int, bool); +int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); @@ -2508,6 +2437,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) } } +static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) +{ + if (unlikely(sock_flag(sk, SOCK_FASYNC))) + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); +} + /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at @@ -2824,12 +2759,10 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); -#ifdef CONFIG_TLS_DEVICE - } else if (unlikely(skb->decrypted)) { + } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; -#endif } #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index 2bcf30381d..32815a40de 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -52,6 +52,8 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); +DECLARE_PER_CPU(u32, tcp_tw_isn); + void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -294,14 +296,6 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline bool tcp_out_of_memory(struct sock *sk) -{ - if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) - return true; - return false; -} - static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); @@ -314,7 +308,7 @@ static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) void sk_forced_mem_schedule(struct sock *sk, int size); -bool tcp_check_oom(struct sock *sk, int shift); +bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; @@ -353,7 +347,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); -void tcp_twsk_purge(struct list_head *net_exit_list, int family); +void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); @@ -392,7 +386,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th); + const struct tcphdr *th, + u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); @@ -667,7 +662,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); -void tcp_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); @@ -681,6 +677,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); +void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); @@ -742,7 +739,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(const struct sock *sk) +static inline void tcp_bound_rto(struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; @@ -925,6 +922,19 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +/* State flags for sacked in struct tcp_skb_cb */ +enum tcp_skb_cb_sacked_flags { + TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ + TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ + TCPCB_LOST = (1 << 2), /* SKB is lost */ + TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | + TCPCB_LOST), /* All tag bits */ + TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ + TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ + TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | + TCPCB_REPAIRED), +}; + /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. @@ -935,13 +945,10 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : tcp_tw_isn is used in input path only - * (isn chosen by tcp_timewait_state_process()) - * + /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ - __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; @@ -950,15 +957,6 @@ struct tcp_skb_cb { __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ -#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ -#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ -#define TCPCB_LOST 0x04 /* SKB is lost */ -#define TCPCB_TAGBITS 0x07 /* All tag bits */ -#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ -#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ -#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ - TCPCB_REPAIRED) - __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ @@ -1167,7 +1165,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ @@ -2194,7 +2192,10 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); @@ -2283,7 +2284,8 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req); + struct request_sock *req, + u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, @@ -2705,10 +2707,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } -static inline void tcp_bpf_rtt(struct sock *sk) +static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) - tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) diff --git a/include/net/tcx.h b/include/net/tcx.h index 0a5f40a91c..5ce0ce9e0c 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -75,9 +75,9 @@ tcx_entry_fetch(struct net_device *dev, bool ingress) return rcu_dereference_rtnl(dev->tcx_egress); } -static inline struct bpf_mprog_entry *tcx_entry_create(void) +static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) { - struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); + struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); @@ -85,6 +85,7 @@ static inline struct bpf_mprog_entry *tcx_entry_create(void) } return NULL; } +#define tcx_entry_create(...) alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 74d2b463cc..62b3e9f2ae 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,18 +15,9 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - int (*twsk_unique)(struct sock *sk, - struct sock *sktw, void *twp); void (*twsk_destructor)(struct sock *sk); }; -static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) -{ - if (sk->sk_prot->twsk_prot->twsk_unique != NULL) - return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); - return 0; -} - static inline void twsk_destructor(struct sock *sk) { if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) diff --git a/include/net/tls.h b/include/net/tls.h index 33f657d3c0..3a33924db2 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -362,7 +362,7 @@ static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb) static inline struct tls_context *tls_get_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code, * TLS data path doesn't need rcu_dereference(). diff --git a/include/net/udp.h b/include/net/udp.h index 488a6d2bab..c4e05b14b6 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -379,14 +379,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n; - - n = copy_to_iter(skb->data + off, len, to); - if (n == len) - return 0; - - iov_iter_revert(to, n); - return -EFAULT; + return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT; } /* diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index d716214fe0..a93dc51f63 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -179,8 +179,8 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, struct dst_cache *dst_cache); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, - int md_size); + const unsigned long *flags, + __be64 tunnel_id, int md_size); #ifdef CONFIG_INET static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index c9aec9ab61..0a5dca2b2b 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -219,13 +219,10 @@ static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool return meta; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); - if (!pool->dma_need_sync) - return; - xp_dma_sync_for_cpu(xskb); } @@ -402,7 +399,7 @@ static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool return NULL; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index cb4841a9ff..7d4c223525 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -178,7 +178,10 @@ struct xfrm_state { struct hlist_node gclist; struct hlist_node bydst; }; - struct hlist_node bysrc; + union { + struct hlist_node dev_gclist; + struct hlist_node bysrc; + }; struct hlist_node byspi; struct hlist_node byseq; @@ -291,6 +294,7 @@ struct xfrm_state { /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; + u8 dir; }; static inline struct net *xs_net(struct xfrm_state *x) @@ -1587,7 +1591,7 @@ void xfrm_state_update_stats(struct net *net); static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) { struct xfrm_dev_offload *xdo = &x->xso; - struct net_device *dev = xdo->dev; + struct net_device *dev = READ_ONCE(xdo->dev); if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_update_stats) @@ -1945,13 +1949,16 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +void xfrm_dev_state_delete(struct xfrm_state *x); +void xfrm_dev_state_free(struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = READ_ONCE(xso->dev); - if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) - xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); + if (dev && dev->xfrmdev_ops->xdo_dev_state_advance_esn) + dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) @@ -1972,28 +1979,6 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) return false; } -static inline void xfrm_dev_state_delete(struct xfrm_state *x) -{ - struct xfrm_dev_offload *xso = &x->xso; - - if (xso->dev) - xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); -} - -static inline void xfrm_dev_state_free(struct xfrm_state *x) -{ - struct xfrm_dev_offload *xso = &x->xso; - struct net_device *dev = xso->dev; - - if (dev && dev->xfrmdev_ops) { - if (dev->xfrmdev_ops->xdo_dev_state_free) - dev->xfrmdev_ops->xdo_dev_state_free(x); - xso->dev = NULL; - xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; - netdev_put(dev, &xso->dev_tracker); - } -} - static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) { struct xfrm_dev_offload *xdo = &x->xdo; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 99dd7376df..bacb33f1e3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -43,7 +43,6 @@ struct xsk_dma_map { refcount_t users; struct list_head list; /* Protected by the RTNL_LOCK */ u32 dma_pages_cnt; - bool dma_need_sync; }; struct xsk_buff_pool { @@ -82,7 +81,6 @@ struct xsk_buff_pool { u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; - bool dma_need_sync; bool unaligned; bool tx_sw_csum; void *addrs; @@ -155,21 +153,17 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) return xskb->frame_dma; } -void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - xp_dma_sync_for_cpu_slow(xskb); + dma_sync_single_for_cpu(xskb->pool->dev, xskb->dma, + xskb->pool->frame_len, + DMA_BIDIRECTIONAL); } -void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, - size_t size); static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (!pool->dma_need_sync) - return; - - xp_dma_sync_for_device_slow(pool, dma, size); + dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); } /* Masks for xdp_umem_page flags. |