diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 17:40:19 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-18 17:40:19 +0000 |
commit | 9f0fc191371843c4fc000a226b0a26b6c059aacd (patch) | |
tree | 35f8be3ef04506ac891ad001e8c41e535ae8d01d /net/netfilter | |
parent | Releasing progress-linux version 6.6.15-2~progress7.99u1. (diff) | |
download | linux-9f0fc191371843c4fc000a226b0a26b6c059aacd.tar.xz linux-9f0fc191371843c4fc000a226b0a26b6c059aacd.zip |
Merging upstream version 6.7.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
55 files changed, 1007 insertions, 762 deletions
diff --git a/net/netfilter/core.c b/net/netfilter/core.c index ef4e76e5ae..3126911f50 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -639,10 +639,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, if (ret == 1) continue; return ret; + case NF_STOLEN: + return NF_DROP_GETERR(verdict); default: - /* Implicit handling for NF_STOLEN, as well as any other - * non conventional verdicts. - */ + WARN_ON_ONCE(1); return 0; } } diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 26ab0e9612..9523104a90 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -28,6 +28,7 @@ #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) @@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; - if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); - if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); @@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t) add_timer(&map->gc); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct mtype *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + del_timer_sync(&map->gc); +} + static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, @@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = { .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4c133e06be..3184cc6be4 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1154,6 +1154,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, return ret; cleanup: + set->variant->cancel_gc(set); set->variant->destroy(set); put_out: module_put(set->type->me); @@ -1182,6 +1183,14 @@ ip_set_destroy_set(struct ip_set *set) kfree(set); } +static void +ip_set_destroy_set_rcu(struct rcu_head *head) +{ + struct ip_set *set = container_of(head, struct ip_set, rcu); + + ip_set_destroy_set(set); +} + static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const attr[]) { @@ -1193,8 +1202,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; - /* Must wait for flush to be really finished in list:set */ - rcu_barrier(); /* Commands are serialized and references are * protected by the ip_set_ref_lock. @@ -1206,8 +1213,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, * counter, so if it's already zero, we can proceed * without holding the lock. */ - read_lock_bh(&ip_set_ref_lock); if (!attr[IPSET_ATTR_SETNAME]) { + /* Must wait for flush to be really finished in list:set */ + rcu_barrier(); + read_lock_bh(&ip_set_ref_lock); for (i = 0; i < inst->ip_set_max; i++) { s = ip_set(inst, i); if (s && (s->ref || s->ref_netlink)) { @@ -1221,6 +1230,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, s = ip_set(inst, i); if (s) { ip_set(inst, i) = NULL; + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); ip_set_destroy_set(s); } } @@ -1228,6 +1239,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, inst->is_destroyed = false; } else { u32 flags = flag_exist(info->nlh); + u16 features = 0; + + read_lock_bh(&ip_set_ref_lock); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { @@ -1238,10 +1252,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ret = -IPSET_ERR_BUSY; goto out; } + features = s->type->features; ip_set(inst, i) = NULL; read_unlock_bh(&ip_set_ref_lock); - - ip_set_destroy_set(s); + if (features & IPSET_TYPE_NAME) { + /* Must wait for flush to be really finished */ + rcu_barrier(); + } + /* Must cancel garbage collectors */ + s->variant->cancel_gc(s); + call_rcu(&s->rcu, ip_set_destroy_set_rcu); } return 0; out: @@ -1394,9 +1414,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); - /* Make sure all readers of the old set pointers are completed. */ - synchronize_rcu(); - return 0; } @@ -2362,6 +2379,7 @@ ip_set_net_exit(struct net *net) set = ip_set(inst, i); if (set) { ip_set(inst, i) = NULL; + set->variant->cancel_gc(set); ip_set_destroy_set(set); } } @@ -2409,8 +2427,11 @@ ip_set_fini(void) { nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - unregister_pernet_subsys(&ip_set_net_ops); + + /* Wait for call_rcu() in destroy */ + rcu_barrier(); + pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7c23995417..20aad81fca 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -221,6 +221,7 @@ static const union nf_inet_addr zeromask = {}; #undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init +#undef mtype_cancel_gc #undef mtype_variant #undef mtype_data_match @@ -265,6 +266,7 @@ static const union nf_inet_addr zeromask = {}; #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) +#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) @@ -429,7 +431,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference(hbucket(t, i)); + n = (__force struct hbucket *)hbucket(t, i); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -449,10 +451,7 @@ mtype_destroy(struct ip_set *set) struct htype *h = set->data; struct list_head *l, *lt; - if (SET_WITH_TIMEOUT(set)) - cancel_delayed_work_sync(&h->gc.dwork); - - mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + mtype_ahash_destroy(set, (__force struct htable *)h->table, true); list_for_each_safe(l, lt, &h->ad) { list_del(l); kfree(l); @@ -598,6 +597,15 @@ mtype_gc_init(struct htable_gc *gc) queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static void +mtype_cancel_gc(struct ip_set *set) +{ + struct htype *h = set->data; + + if (SET_WITH_TIMEOUT(set)) + cancel_delayed_work_sync(&h->gc.dwork); +} + static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags); @@ -1440,6 +1448,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .cancel_gc = mtype_cancel_gc, .region_lock = true, }; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index e162636525..6c3f28bc59 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set) struct list_set *map = set->data; struct set_elem *e, *n; - if (SET_WITH_TIMEOUT(set)) - timer_shutdown_sync(&map->gc); - list_for_each_entry_safe(e, n, &map->members, list) { list_del(&e->list); ip_set_put_byindex(map->net, e->id); @@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } +static void +list_set_cancel_gc(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (SET_WITH_TIMEOUT(set)) + timer_shutdown_sync(&map->gc); +} + static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, @@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = { .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, + .cancel_gc = list_set_cancel_gc, }; static void diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 3230506ae3..a2c16b5010 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2450,3 +2450,4 @@ static void __exit ip_vs_cleanup(void) module_init(ip_vs_init); module_exit(ip_vs_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Virtual Server"); diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 5e6ec32aff..75f4c231f4 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -270,3 +270,4 @@ static void __exit ip_vs_dh_cleanup(void) module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs destination hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index b846cc3852..ab117e5bc3 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -72,3 +72,4 @@ static void __exit ip_vs_fo_cleanup(void) module_init(ip_vs_fo_init); module_exit(ip_vs_fo_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted failover scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index ef1f45e43b..f53899d124 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -635,3 +635,4 @@ static void __exit ip_vs_ftp_exit(void) module_init(ip_vs_ftp_init); module_exit(ip_vs_ftp_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs ftp helper"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index cf78ba4ce5..8ceec7a2fa 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -632,3 +632,4 @@ static void __exit ip_vs_lblc_cleanup(void) module_init(ip_vs_lblc_init); module_exit(ip_vs_lblc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 9eddf118b4..0fb6470721 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -817,3 +817,4 @@ static void __exit ip_vs_lblcr_cleanup(void) module_init(ip_vs_lblcr_init); module_exit(ip_vs_lblcr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 9d34d81fc6..c2764505e3 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -86,3 +86,4 @@ static void __exit ip_vs_lc_cleanup(void) module_init(ip_vs_lc_init); module_exit(ip_vs_lc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index f56862a875..ed7f5c889b 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -136,3 +136,4 @@ static void __exit ip_vs_nq_cleanup(void) module_init(ip_vs_nq_init); module_exit(ip_vs_nq_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs never queue scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index c03066fdd5..c7708b8097 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -79,3 +79,4 @@ static void __exit ip_vs_ovf_cleanup(void) module_init(ip_vs_ovf_init); module_exit(ip_vs_ovf_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs overflow connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0ac6705a61..e4ce1d9a63 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -185,3 +185,4 @@ static void __exit ip_vs_sip_cleanup(void) module_init(ip_vs_sip_init); module_exit(ip_vs_sip_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs sip helper"); diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 38495c6f6c..6baa34dff9 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -122,4 +122,5 @@ static void __exit ip_vs_rr_cleanup(void) module_init(ip_vs_rr_init); module_exit(ip_vs_rr_cleanup); +MODULE_DESCRIPTION("ipvs round-robin scheduler"); MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 7663288e53..a46f99a566 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -137,3 +137,4 @@ static void __exit ip_vs_sed_cleanup(void) module_init(ip_vs_sed_init); module_exit(ip_vs_sed_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs shortest expected delay scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index c2028e4120..92e77d7a6b 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -376,3 +376,4 @@ static void __exit ip_vs_sh_cleanup(void) module_init(ip_vs_sh_init); module_exit(ip_vs_sh_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs source hashing scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 4174076c66..eaf9f2ed00 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1298,17 +1298,13 @@ static void set_sock_size(struct sock *sk, int mode, int val) static void set_mcast_loop(struct sock *sk, u_char loop) { /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); inet_assign_bit(MC_LOOP, sk, loop); #ifdef CONFIG_IP_VS_IPV6 - if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - + if (READ_ONCE(sk->sk_family) == AF_INET6) { /* IPV6_MULTICAST_LOOP */ - np->mc_loop = loop ? 1 : 0; + inet6_assign_bit(MC6_LOOP, sk, loop); } #endif - release_sock(sk); } /* @@ -1320,13 +1316,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); - inet->mc_ttl = ttl; + WRITE_ONCE(inet->mc_ttl, ttl); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MULTICAST_HOPS */ - np->mcast_hops = ttl; + WRITE_ONCE(np->mcast_hops, ttl); } #endif release_sock(sk); @@ -1339,13 +1335,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ lock_sock(sk); - inet->pmtudisc = val; + WRITE_ONCE(inet->pmtudisc, val); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); /* IPV6_MTU_DISCOVER */ - np->pmtudisc = val; + WRITE_ONCE(np->pmtudisc, val); } #endif release_sock(sk); diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index 3308e4cc74..8d5419edde 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -137,3 +137,4 @@ static void __exit ip_vs_twos_cleanup(void) module_init(ip_vs_twos_init); module_exit(ip_vs_twos_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs power of twos choice scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 09f584b564..9fa500927c 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -109,3 +109,4 @@ static void __exit ip_vs_wlc_cleanup(void) module_init(ip_vs_wlc_init); module_exit(ip_vs_wlc_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted least connection scheduler"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 1bc7a0789d..85ce0d04af 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -263,3 +263,4 @@ static void __exit ip_vs_wrr_cleanup(void) module_init(ip_vs_wrr_init); module_exit(ip_vs_wrr_cleanup); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ipvs weighted round-robin scheduler"); diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index b21799d468..475358ec82 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -230,9 +230,7 @@ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, return 0; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in nf_conntrack BTF"); +__bpf_kfunc_start_defs(); /* bpf_xdp_ct_alloc - Allocate a new CT entry * @@ -467,7 +465,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status) return nf_ct_change_status_common(nfct, status); } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(nf_ct_kfunc_set) BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL) diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 9fb9b80312..cfa0fe0356 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -82,3 +82,4 @@ out: EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Broadcast connection tracking helper"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9f6f2e6435..2e5f3864d3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2042,24 +2042,6 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_in); -/* Alter reply tuple (maybe alter helper). This is for NAT, and is - implicitly racy: see __nf_conntrack_confirm */ -void nf_conntrack_alter_reply(struct nf_conn *ct, - const struct nf_conntrack_tuple *newreply) -{ - struct nf_conn_help *help = nfct_help(ct); - - /* Should be unconfirmed, so not in hash table yet */ - WARN_ON(nf_ct_is_confirmed(ct)); - - nf_ct_dump_tuple(newreply); - - ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; - if (ct->master || (help && !hlist_empty(&help->expectations))) - return; -} -EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); - /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -2187,11 +2169,11 @@ static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); if (dataoff <= 0) - return -1; + return NF_DROP; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, l4num, net, &tuple)) - return -1; + return NF_DROP; if (ct->status & IPS_SRC_NAT) { memcpy(tuple.src.u3.all, @@ -2211,7 +2193,7 @@ static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); if (!h) - return 0; + return NF_ACCEPT; /* Store status bits of the conntrack that is clashing to re-do NAT * mangling according to what it has been done already to this packet. @@ -2224,19 +2206,25 @@ static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, nat_hook = rcu_dereference(nf_nat_hook); if (!nat_hook) - return 0; + return NF_ACCEPT; - if (status & IPS_SRC_NAT && - nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, - IP_CT_DIR_ORIGINAL) == NF_DROP) - return -1; + if (status & IPS_SRC_NAT) { + unsigned int verdict = nat_hook->manip_pkt(skb, ct, + NF_NAT_MANIP_SRC, + IP_CT_DIR_ORIGINAL); + if (verdict != NF_ACCEPT) + return verdict; + } - if (status & IPS_DST_NAT && - nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, - IP_CT_DIR_ORIGINAL) == NF_DROP) - return -1; + if (status & IPS_DST_NAT) { + unsigned int verdict = nat_hook->manip_pkt(skb, ct, + NF_NAT_MANIP_DST, + IP_CT_DIR_ORIGINAL); + if (verdict != NF_ACCEPT) + return verdict; + } - return 0; + return NF_ACCEPT; } /* This packet is coming from userspace via nf_queue, complete the packet @@ -2251,14 +2239,14 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, help = nfct_help(ct); if (!help) - return 0; + return NF_ACCEPT; helper = rcu_dereference(help->helper); if (!helper) - return 0; + return NF_ACCEPT; if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) - return 0; + return NF_ACCEPT; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: @@ -2273,42 +2261,44 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return 0; + return NF_ACCEPT; break; } #endif default: - return 0; + return NF_ACCEPT; } if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return -1; + return NF_DROP; } } /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; + return nf_conntrack_confirm(skb); } static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; - int err; ct = nf_ct_get(skb, &ctinfo); if (!ct) - return 0; + return NF_ACCEPT; if (!nf_ct_is_confirmed(ct)) { - err = __nf_conntrack_update(net, skb, ct, ctinfo); - if (err < 0) - return err; + int ret = __nf_conntrack_update(net, skb, ct, ctinfo); + + if (ret != NF_ACCEPT) + return ret; ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; } return nf_confirm_cthelper(skb, ct, ctinfo); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index f22691f838..4ed5878cb2 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -194,12 +194,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - /* We already got a helper explicitly attached. The function - * nf_conntrack_alter_reply - in case NAT is in use - asks for looking - * the helper up again. Since now the user is in full control of - * making consistent helper configurations, skip this automatic - * re-lookup, otherwise we'll lose the helper. - */ + /* We already got a helper explicitly attached (e.g. nft_ct) */ if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 6e70e137a0..6c46aad233 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -11,8 +11,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> -static DEFINE_SPINLOCK(nf_connlabels_lock); - static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -60,23 +58,24 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { + int v; + if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; - spin_lock(&nf_connlabels_lock); - net->ct.labels_used++; - spin_unlock(&nf_connlabels_lock); - BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); + v = atomic_inc_return_relaxed(&net->ct.labels_used); + WARN_ON_ONCE(v <= 0); + return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_get); void nf_connlabels_put(struct net *net) { - spin_lock(&nf_connlabels_lock); - net->ct.labels_used--; - spin_unlock(&nf_connlabels_lock); + int v = atomic_dec_return_relaxed(&net->ct.labels_used); + + WARN_ON_ONCE(v < 0); } EXPORT_SYMBOL_GPL(nf_connlabels_put); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 334db22199..fb0ae15e96 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -57,6 +57,7 @@ #include "nf_internals.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { struct nf_conn *last; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index c928ff63b1..f36727ed91 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -699,3 +699,4 @@ MODULE_ALIAS("ip_conntrack"); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking"); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index c6bd533983..4cc97f9712 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -283,7 +283,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; - } else { + } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 4018acb1d6..ae493599a3 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -457,7 +457,8 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, - u32 end, u32 win) + u32 end, u32 win, + enum ip_conntrack_dir dir) { /* SYN-ACK in reply to a SYN * or SYN from reply direction in simultaneous open. @@ -471,7 +472,8 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender, * Both sides must send the Window Scale option * to enable window scaling in either direction. */ - if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + if (dir == IP_CT_DIR_REPLY && + !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { sender->td_scale = 0; receiver->td_scale = 0; @@ -542,7 +544,7 @@ tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, if (tcph->syn) { tcp_init_sender(sender, receiver, skb, dataoff, tcph, - end, win); + end, win, dir); if (!tcph->ack) /* Simultaneous open */ return NFCT_TCP_ACCEPT; @@ -585,7 +587,7 @@ tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, */ tcp_init_sender(sender, receiver, skb, dataoff, tcph, - end, win); + end, win, dir); if (dir == IP_CT_DIR_REPLY && !tcph->ack) return NFCT_TCP_ACCEPT; @@ -835,7 +837,8 @@ static bool tcp_error(const struct tcphdr *th, static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - const struct tcphdr *th) + const struct tcphdr *th, + const struct nf_hook_state *state) { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); @@ -846,7 +849,7 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { - pr_debug("nf_ct_tcp: invalid new deleting.\n"); + tcp_error_log(skb, state, "invalid new"); return false; } @@ -980,7 +983,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, if (tcp_error(th, skb, dataoff, state)) return -NF_ACCEPT; - if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) + if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state)) return -NF_ACCEPT; spin_lock_bh(&ct->lock); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 920a5a29ae..a057133923 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -87,12 +87,22 @@ static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) return 0; } +static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct dst_entry *dst = route->tuple[dir].dst; + + route->tuple[dir].dst = NULL; + + return dst; +} + static int flow_offload_fill_route(struct flow_offload *flow, - const struct nf_flow_route *route, + struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; - struct dst_entry *dst = route->tuple[dir].dst; + struct dst_entry *dst = nft_route_dst_fetch(route, dir); int i, j = 0; switch (flow_tuple->l3proto) { @@ -122,6 +132,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; + dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: @@ -146,7 +157,7 @@ static void nft_flow_dst_release(struct flow_offload *flow, } void flow_offload_route_init(struct flow_offload *flow, - const struct nf_flow_route *route) + struct nf_flow_route *route) { flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 8cc52d2bd3..e16f158388 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -193,11 +193,12 @@ void nf_logger_put(int pf, enum nf_log_type type) return; } - BUG_ON(loggers[pf][type] == NULL); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); - module_put(logger->me); + if (!logger) + WARN_ON_ONCE(1); + else + module_put(logger->me); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_logger_put); diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 141ee77832..6e3b2f5885 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -12,9 +12,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_nat.h> -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in nf_nat BTF"); +__bpf_kfunc_start_defs(); /* bpf_ct_set_nat_info - Set source or destination nat address * @@ -54,7 +52,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } -__diag_pop() +__bpf_kfunc_end_defs(); BTF_SET8_START(nf_nat_kfunc_set) BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index c4e0516a8d..c3d7ecbc77 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1263,6 +1263,7 @@ static void __exit nf_nat_cleanup(void) } MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup); diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 48cc60084d..dc450cc812 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -668,7 +668,7 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int struct flowi fl; int err; - err = xfrm_decode_session(skb, &fl, family); + err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; @@ -697,6 +697,31 @@ static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int } #endif +static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) +{ + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + const struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + case IPPROTO_UDP: + break; + default: + return false; + } + + dir = CTINFO2DIR(ctinfo); + if (dir != IP_CT_DIR_ORIGINAL) + return false; + + return ct->tuplehash[!dir].tuple.dst.u.all != sport; +} + static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -707,8 +732,20 @@ nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, ret = nf_nat_ipv4_fn(priv, skb, state); - if (ret == NF_ACCEPT && sk && saddr != ip_hdr(skb)->saddr && - !inet_sk_transparent(sk)) + if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) + return ret; + + /* skb has a socket assigned via tcp edemux. We need to check + * if nf_nat_ipv4_fn() has mangled the packet in a way that + * edemux would not have found this socket. + * + * This includes both changes to the source address and changes + * to the source port, which are both handled by the + * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux + * might have found a socket for the old (pre-snat) address. + */ + if (saddr != ip_hdr(skb)->saddr || + nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; @@ -938,14 +975,36 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, } static unsigned int +nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct in6_addr saddr = ipv6_hdr(skb)->saddr; + struct sock *sk = skb->sk; + unsigned int ret; + + ret = nf_nat_ipv6_fn(priv, skb, state); + + if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) + return ret; + + /* see nf_nat_ipv4_local_in */ + if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || + nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) + skb_orphan(skb); + + return ret; +} + +static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - unsigned int ret; + unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); - if (ret != NF_DROP && ret != NF_STOLEN && + verdict = ret & NF_VERDICT_MASK; + if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); @@ -1051,7 +1110,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { }, /* After packet filtering, change source */ { - .hook = nf_nat_ipv6_fn, + .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 16915f8eef..467671f2d4 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; - opts->tsval = tcp_time_stamp_raw() & ~0x3f; + opts->tsval = tcp_clock_ms() & ~0x3f; if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4fc8348dd7..79e088e6f1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -592,9 +592,9 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - nft_setelem_data_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem_priv); return 0; } @@ -602,7 +602,7 @@ static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set_elem_catchall { struct list_head list; struct rcu_head rcu; - void *elem; + struct nft_elem_priv *elem; }; static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, @@ -610,7 +610,6 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; - struct nft_set_elem elem; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { @@ -618,8 +617,7 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; - elem.priv = catchall->elem; - nft_setelem_data_deactivate(ctx->net, set, &elem); + nft_setelem_data_deactivate(ctx->net, set, catchall->elem); break; } } @@ -686,15 +684,16 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; } -static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, - struct nft_flowtable *flowtable) +static struct nft_trans * +nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, + struct nft_flowtable *flowtable) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_flowtable)); if (trans == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); @@ -703,22 +702,22 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, nft_trans_flowtable(trans) = flowtable; nft_trans_commit_list_add_tail(ctx->net, trans); - return 0; + return trans; } static int nft_delflowtable(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { - int err; + struct nft_trans *trans; - err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); - if (err < 0) - return err; + trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable); + if (IS_ERR(trans)) + return PTR_ERR(trans); nft_deactivate_next(ctx->net, flowtable); nft_use_dec(&ctx->table->use); - return err; + return 0; } static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg) @@ -1253,6 +1252,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return 0; err_register_hooks: + ctx->table->flags |= NFT_TABLE_F_DORMANT; nft_trans_destroy(trans); return ret; } @@ -2082,7 +2082,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, struct nft_hook *hook; int err; - hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); + hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); if (!hook) { err = -ENOMEM; goto err_hook_alloc; @@ -2505,19 +2505,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, RCU_INIT_POINTER(chain->blob_gen_0, blob); RCU_INIT_POINTER(chain->blob_gen_1, blob); - err = nf_tables_register_hook(net, table, chain); - if (err < 0) - goto err_destroy_chain; - if (!nft_use_inc(&table->use)) { err = -EMFILE; - goto err_use; + goto err_destroy_chain; } trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto err_unregister_hook; + goto err_trans; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; @@ -2525,17 +2521,22 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, nft_trans_chain_policy(trans) = policy; err = nft_chain_add(table, chain); - if (err < 0) { - nft_trans_destroy(trans); - goto err_unregister_hook; - } + if (err < 0) + goto err_chain_add; + + /* This must be LAST to ensure no packets are walking over this chain. */ + err = nf_tables_register_hook(net, table, chain); + if (err < 0) + goto err_register_hook; return 0; -err_unregister_hook: +err_register_hook: + nft_chain_del(chain); +err_chain_add: + nft_trans_destroy(trans); +err_trans: nft_use_dec_restore(&table->use); -err_use: - nf_tables_unregister_hook(net, table, chain); err_destroy_chain: nf_tables_chain_destroy(ctx); @@ -3327,7 +3328,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, - [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), [NFTA_RULE_COMPAT] = { .type = NLA_NESTED }, [NFTA_RULE_POSITION] = { .type = NLA_U64 }, [NFTA_RULE_USERDATA] = { .type = NLA_BINARY, @@ -3452,20 +3453,21 @@ static void audit_log_rule_reset(const struct nft_table *table, } struct nft_rule_dump_ctx { + unsigned int s_idx; char *table; char *chain; + bool reset; }; static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, - const struct nft_chain *chain, - bool reset) + const struct nft_chain *chain) { + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; - unsigned int s_idx = cb->args[0]; unsigned int entries = 0; int ret = 0; u64 handle; @@ -3474,7 +3476,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, list_for_each_entry_rcu(rule, &chain->rules, list) { if (!nft_is_active(net, rule)) goto cont_skip; - if (*idx < s_idx) + if (*idx < ctx->s_idx) goto cont; if (prule) handle = prule->handle; @@ -3486,7 +3488,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle, reset) < 0) { + table, chain, rule, handle, ctx->reset) < 0) { ret = 1; break; } @@ -3498,7 +3500,7 @@ cont_skip: (*idx)++; } - if (reset && entries) + if (ctx->reset && entries) audit_log_rule_reset(table, cb->seq, entries); return ret; @@ -3508,17 +3510,13 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_rule_dump_ctx *ctx = cb->data; + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; struct nft_table *table; const struct nft_chain *chain; unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; - bool reset = false; - - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) - reset = true; rcu_read_lock(); nft_net = nft_pernet(net); @@ -3528,10 +3526,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (family != NFPROTO_UNSPEC && family != table->family) continue; - if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) + if (ctx->table && strcmp(ctx->table, table->name) != 0) continue; - if (ctx && ctx->table && ctx->chain) { + if (ctx->table && ctx->chain) { struct rhlist_head *list, *tmp; list = rhltable_lookup(&table->chains_ht, ctx->chain, @@ -3543,7 +3541,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, - cb, table, chain, reset); + cb, table, chain); break; } goto done; @@ -3551,68 +3549,81 @@ static int nf_tables_dump_rules(struct sk_buff *skb, list_for_each_entry_rcu(chain, &table->chains, list) { if (__nf_tables_dump_rules(skb, &idx, - cb, table, chain, reset)) + cb, table, chain)) goto done; } - if (ctx && ctx->table) + if (ctx->table) break; } done: rcu_read_unlock(); - cb->args[0] = idx; + ctx->s_idx = idx; return skb->len; } +static int nf_tables_dumpreset_rules(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk)); + int ret; + + /* Mutex is held is to prevent that two concurrent dump-and-reset calls + * do not underrun counters and quotas. The commit_mutex is used for + * the lack a better lock, this is not transaction path. + */ + mutex_lock(&nft_net->commit_mutex); + ret = nf_tables_dump_rules(skb, cb); + mutex_unlock(&nft_net->commit_mutex); + + return ret; +} + static int nf_tables_dump_rules_start(struct netlink_callback *cb) { + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; - struct nft_rule_dump_ctx *ctx = NULL; - if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { - ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); - if (!ctx) - return -ENOMEM; + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - if (nla[NFTA_RULE_TABLE]) { - ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], - GFP_ATOMIC); - if (!ctx->table) { - kfree(ctx); - return -ENOMEM; - } - } - if (nla[NFTA_RULE_CHAIN]) { - ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], - GFP_ATOMIC); - if (!ctx->chain) { - kfree(ctx->table); - kfree(ctx); - return -ENOMEM; - } + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC); + if (!ctx->table) + return -ENOMEM; + } + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC); + if (!ctx->chain) { + kfree(ctx->table); + return -ENOMEM; } } - - cb->data = ctx; return 0; } +static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb) +{ + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; + + ctx->reset = true; + + return nf_tables_dump_rules_start(cb); +} + static int nf_tables_dump_rules_done(struct netlink_callback *cb) { - struct nft_rule_dump_ctx *ctx = cb->data; + struct nft_rule_dump_ctx *ctx = (void *)cb->ctx; - if (ctx) { - kfree(ctx->table); - kfree(ctx->chain); - kfree(ctx); - } + kfree(ctx->table); + kfree(ctx->chain); return 0; } /* called with rcu_read_lock held */ -static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const nla[]) +static struct sk_buff * +nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, + const struct nlattr * const nla[], bool reset) { struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); @@ -3622,60 +3633,110 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - bool reset = false; int err; - if (info->nlh->nlmsg_flags & NLM_F_DUMP) { - struct netlink_dump_control c = { - .start= nf_tables_dump_rules_start, - .dump = nf_tables_dump_rules, - .done = nf_tables_dump_rules_done, - .module = THIS_MODULE, - .data = (void *)nla, - }; - - return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); - } - table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); - return PTR_ERR(table); + return ERR_CAST(table); } chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + return ERR_CAST(chain); } rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); - return PTR_ERR(rule); + return ERR_CAST(rule); } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) - return -ENOMEM; - - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) - reset = true; + return ERR_PTR(-ENOMEM); - err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, 0, reset); - if (err < 0) - goto err_fill_rule_info; + if (err < 0) { + kfree_skb(skb2); + return ERR_PTR(err); + } - if (reset) - audit_log_rule_reset(table, nft_pernet(net)->base_seq, 1); + return skb2; +} - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); +static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; -err_fill_rule_info: - kfree_skb(skb2); - return err; + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start= nf_tables_dump_rules_start, + .dump = nf_tables_dump_rules, + .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + skb2 = nf_tables_getrule_single(portid, info, nla, false); + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + return nfnetlink_unicast(skb2, net, portid); +} + +static int nf_tables_getrule_reset(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) +{ + struct nftables_pernet *nft_net = nft_pernet(info->net); + u32 portid = NETLINK_CB(skb).portid; + struct net *net = info->net; + struct sk_buff *skb2; + char *buf; + + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .start= nf_tables_dumpreset_rules_start, + .dump = nf_tables_dumpreset_rules, + .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, + .data = (void *)nla, + }; + + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); + } + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + rcu_read_unlock(); + mutex_lock(&nft_net->commit_mutex); + skb2 = nf_tables_getrule_single(portid, info, nla, true); + mutex_unlock(&nft_net->commit_mutex); + rcu_read_lock(); + module_put(THIS_MODULE); + + if (IS_ERR(skb2)) + return PTR_ERR(skb2); + + buf = kasprintf(GFP_ATOMIC, "%.*s:%u", + nla_len(nla[NFTA_RULE_TABLE]), + (char *)nla_data(nla[NFTA_RULE_TABLE]), + nft_net->base_seq); + audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, + AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); + kfree(buf); + + return nfnetlink_unicast(skb2, net, portid); } void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) @@ -3758,9 +3819,9 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; @@ -3790,7 +3851,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; - struct nft_set_elem elem; struct nft_set_ext *ext; int ret = 0; @@ -3799,8 +3859,7 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) if (!nft_set_elem_active(ext, genmask)) continue; - elem.priv = catchall->elem; - ret = nft_setelem_validate(ctx, set, NULL, &elem); + ret = nft_setelem_validate(ctx, set, NULL, catchall->elem); if (ret < 0) return ret; } @@ -4261,12 +4320,16 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, - [NFTA_SET_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), +}; + +static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { + [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, - [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED }, + [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; static struct nft_set *nft_set_lookup(const struct nft_table *table, @@ -4705,8 +4768,10 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, return -EINVAL; set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); - if (IS_ERR(set)) + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) @@ -4723,10 +4788,6 @@ err_fill_set_info: return err; } -static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { - [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 }, -}; - static int nft_set_desc_concat_parse(const struct nlattr *attr, struct nft_set_desc *desc) { @@ -5269,9 +5330,9 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, static int nft_setelem_data_validate(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); enum nft_registers dreg; dreg = nft_type_to_reg(set->dtype); @@ -5284,9 +5345,9 @@ static int nft_setelem_data_validate(const struct nft_ctx *ctx, static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - return nft_setelem_data_validate(ctx, set, elem); + return nft_setelem_data_validate(ctx, set, elem_priv); } static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, @@ -5294,7 +5355,6 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; - struct nft_set_elem elem; struct nft_set_ext *ext; int ret = 0; @@ -5303,8 +5363,7 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; - elem.priv = catchall->elem; - ret = nft_setelem_data_validate(ctx, set, &elem); + ret = nft_setelem_data_validate(ctx, set, catchall->elem); if (ret < 0) break; } @@ -5371,14 +5430,14 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, - struct nft_set_elem *elem); + struct nft_elem_priv *elem_priv); static int nft_mapelem_activate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - nft_setelem_data_activate(ctx->net, set, elem); + nft_setelem_data_activate(ctx->net, set, elem_priv); return 0; } @@ -5388,7 +5447,6 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; - struct nft_set_elem elem; struct nft_set_ext *ext; list_for_each_entry(catchall, &set->catchall_list, list) { @@ -5396,8 +5454,7 @@ static void nft_map_catchall_activate(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; - elem.priv = catchall->elem; - nft_setelem_data_activate(ctx->net, set, &elem); + nft_setelem_data_activate(ctx->net, set, catchall->elem); break; } } @@ -5524,7 +5581,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING, .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED }, - [NFTA_SET_ELEM_EXPRESSIONS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { @@ -5532,7 +5589,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, + [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy), [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; @@ -5576,10 +5633,10 @@ nla_put_failure: static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem, + const struct nft_elem_priv *elem_priv, bool reset) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -5665,16 +5722,16 @@ struct nft_set_dump_args { static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_set_dump_args *args; if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext)) return 0; args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem, args->reset); + return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset); } static void audit_log_nft_set_reset(const struct nft_table *table, @@ -5691,6 +5748,7 @@ static void audit_log_nft_set_reset(const struct nft_table *table, struct nft_set_dump_ctx { const struct nft_set *set; struct nft_ctx ctx; + bool reset; }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, @@ -5699,7 +5757,6 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); - struct nft_set_elem elem; struct nft_set_ext *ext; int ret = 0; @@ -5709,8 +5766,7 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, nft_set_elem_expired(ext)) continue; - elem.priv = catchall->elem; - ret = nf_tables_fill_setelem(skb, set, &elem, reset); + ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset); if (reset && !ret) audit_log_nft_set_reset(set->table, base_seq, 1); break; @@ -5730,7 +5786,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; - bool reset = false; u32 portid, seq; int event; @@ -5778,12 +5833,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) - reset = true; - args.cb = cb; args.skb = skb; - args.reset = reset; + args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; @@ -5793,11 +5845,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (!args.iter.err && args.iter.count == cb->args[0]) args.iter.err = nft_set_catchall_dump(net, skb, set, - reset, cb->seq); + dump_ctx->reset, cb->seq); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); - if (reset && args.iter.count > args.iter.skip) + if (dump_ctx->reset && args.iter.count > args.iter.skip) audit_log_nft_set_reset(table, cb->seq, args.iter.count - args.iter.skip); @@ -5835,7 +5887,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem, + const struct nft_elem_priv *elem_priv, bool reset) { struct nlmsghdr *nlh; @@ -5857,7 +5909,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem, reset); + err = nf_tables_fill_setelem(skb, set, elem_priv, reset); if (err < 0) goto nla_put_failure; @@ -6007,7 +6059,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem, + NFT_MSG_NEWSETELEM, 0, set, elem.priv, reset); if (err < 0) goto err_fill_setelem; @@ -6043,11 +6095,16 @@ static int nf_tables_getsetelem(struct sk_buff *skb, } set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); - if (IS_ERR(set)) + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); + } nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, @@ -6058,6 +6115,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, struct nft_set_dump_ctx dump_ctx = { .set = set, .ctx = ctx, + .reset = reset, }; c.data = &dump_ctx; @@ -6067,9 +6125,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; - if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) - reset = true; - nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_get_set_elem(&ctx, set, attr, reset); if (err < 0) { @@ -6088,7 +6143,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, - const struct nft_set_elem *elem, + const struct nft_elem_priv *elem_priv, int event) { struct nftables_pernet *nft_net; @@ -6109,7 +6164,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, - set, elem, false); + set, elem_priv, false); if (err < 0) { kfree_skb(skb); goto err; @@ -6184,10 +6239,11 @@ static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id, return 0; } -void *nft_set_elem_init(const struct nft_set *set, - const struct nft_set_ext_tmpl *tmpl, - const u32 *key, const u32 *key_end, - const u32 *data, u64 timeout, u64 expiration, gfp_t gfp) +struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, + const struct nft_set_ext_tmpl *tmpl, + const u32 *key, const u32 *key_end, + const u32 *data, + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -6252,10 +6308,11 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } /* Drop references and destroy. Called from gc, dynset and abort path. */ -void nft_set_elem_destroy(const struct nft_set *set, void *elem, +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, bool destroy_expr) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); struct nft_ctx ctx = { .net = read_pnet(&set->net), .family = set->table->family, @@ -6266,10 +6323,10 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); - if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); - kfree(elem); + + kfree(elem_priv); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); @@ -6277,14 +6334,15 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy); * path via nft_setelem_data_deactivate(). */ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) + const struct nft_set *set, + const struct nft_elem_priv *elem_priv) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); - kfree(elem); + kfree(elem_priv); } int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, @@ -6379,7 +6437,7 @@ EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **pext) + struct nft_elem_priv **priv) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_next(net); @@ -6388,7 +6446,7 @@ static int nft_setelem_catchall_insert(const struct net *net, list_for_each_entry(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask)) { - *pext = ext; + *priv = catchall->elem; return -EEXIST; } } @@ -6406,22 +6464,23 @@ static int nft_setelem_catchall_insert(const struct net *net, static int nft_setelem_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext, unsigned int flags) + struct nft_elem_priv **elem_priv, + unsigned int flags) { int ret; if (flags & NFT_SET_ELEM_CATCHALL) - ret = nft_setelem_catchall_insert(net, set, elem, ext); + ret = nft_setelem_catchall_insert(net, set, elem, elem_priv); else - ret = set->ops->insert(net, set, elem, ext); + ret = set->ops->insert(net, set, elem, elem_priv); return ret; } static bool nft_setelem_is_catchall(const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_elem_priv *elem_priv) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) @@ -6431,14 +6490,14 @@ static bool nft_setelem_is_catchall(const struct nft_set *set, } static void nft_setelem_activate(struct net *net, struct nft_set *set, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - if (nft_setelem_is_catchall(set, elem)) { + if (nft_setelem_is_catchall(set, elem_priv)) { nft_set_elem_change_active(net, set, ext); } else { - set->ops->activate(net, set, elem); + set->ops->activate(net, set, elem_priv); } } @@ -6502,12 +6561,12 @@ static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall) static void nft_setelem_catchall_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { struct nft_set_elem_catchall *catchall, *next; list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { - if (catchall->elem == elem->priv) { + if (catchall->elem == elem_priv) { nft_setelem_catchall_destroy(catchall); break; } @@ -6516,12 +6575,12 @@ static void nft_setelem_catchall_remove(const struct net *net, static void nft_setelem_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - if (nft_setelem_is_catchall(set, elem)) - nft_setelem_catchall_remove(net, set, elem); + if (nft_setelem_is_catchall(set, elem_priv)) + nft_setelem_catchall_remove(net, set, elem_priv); else - set->ops->remove(net, set, elem); + set->ops->remove(net, set, elem_priv); } static bool nft_setelem_valid_key_end(const struct nft_set *set, @@ -6554,13 +6613,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_elem_priv *elem_priv; struct nft_object *obj = NULL; struct nft_userdata *udata; struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u64 timeout; u64 expiration; + u64 timeout; int err, i; u8 ulen; @@ -6853,9 +6913,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, ext->genmask = nft_genmask_cur(ctx->net); - err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); + err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags); if (err) { if (err == -EEXIST) { + ext2 = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ @@ -6889,12 +6950,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } } - nft_trans_elem(trans) = elem; + nft_trans_elem_priv(trans) = elem.priv; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_set_full: - nft_setelem_remove(ctx->net, set, &elem); + nft_setelem_remove(ctx->net, set, elem.priv); err_element_clash: kfree(trans); err_elem_free: @@ -6942,8 +7003,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb, set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET], nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); - if (IS_ERR(set)) + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); + } if (!list_empty(&set->bindings) && (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) @@ -6993,9 +7056,9 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) static void nft_setelem_data_activate(const struct net *net, const struct nft_set *set, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); @@ -7005,9 +7068,9 @@ static void nft_setelem_data_activate(const struct net *net, void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); @@ -7092,9 +7155,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) goto fail_ops; - nft_setelem_data_deactivate(ctx->net, set, &elem); + nft_setelem_data_deactivate(ctx->net, set, elem.priv); - nft_trans_elem(trans) = elem; + nft_trans_elem_priv(trans) = elem.priv; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -7112,36 +7175,29 @@ fail_elem: static int nft_setelem_flush(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { struct nft_trans *trans; - int err; trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, sizeof(struct nft_trans_elem), GFP_ATOMIC); if (!trans) return -ENOMEM; - if (!set->ops->flush(ctx->net, set, elem->priv)) { - err = -ENOENT; - goto err1; - } + set->ops->flush(ctx->net, set, elem_priv); set->ndeact++; - nft_setelem_data_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; - nft_trans_elem(trans) = *elem; + nft_trans_elem_priv(trans) = elem_priv; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; -err1: - kfree(trans); - return err; } static int __nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { struct nft_trans *trans; @@ -7150,9 +7206,9 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, if (!trans) return -ENOMEM; - nft_setelem_data_deactivate(ctx->net, set, elem); + nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; - nft_trans_elem(trans) = *elem; + nft_trans_elem_priv(trans) = elem_priv; nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -7163,7 +7219,6 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, { u8 genmask = nft_genmask_next(ctx->net); struct nft_set_elem_catchall *catchall; - struct nft_set_elem elem; struct nft_set_ext *ext; int ret = 0; @@ -7172,8 +7227,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, genmask)) continue; - elem.priv = catchall->elem; - ret = __nft_set_catchall_flush(ctx, set, &elem); + ret = __nft_set_catchall_flush(ctx, set, catchall->elem); if (ret < 0) break; nft_set_elem_change_active(ctx->net, set, ext); @@ -7218,8 +7272,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb, } set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); - if (IS_ERR(set)) + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); + } if (nft_set_is_anonymous(set)) return -EOPNOTSUPP; @@ -7415,11 +7471,15 @@ nla_put_failure: return -1; } -static const struct nft_object_type *__nft_obj_type_get(u32 objtype) +static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family) { const struct nft_object_type *type; list_for_each_entry(type, &nf_tables_objects, list) { + if (type->family != NFPROTO_UNSPEC && + type->family != family) + continue; + if (objtype == type->type) return type; } @@ -7427,11 +7487,11 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype) } static const struct nft_object_type * -nft_obj_type_get(struct net *net, u32 objtype) +nft_obj_type_get(struct net *net, u32 objtype, u8 family) { const struct nft_object_type *type; - type = __nft_obj_type_get(objtype); + type = __nft_obj_type_get(objtype, family); if (type != NULL && try_module_get(type->owner)) return type; @@ -7524,7 +7584,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - type = __nft_obj_type_get(objtype); + type = __nft_obj_type_get(objtype, family); if (WARN_ON_ONCE(!type)) return -ENOENT; @@ -7538,7 +7598,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, if (!nft_use_inc(&table->use)) return -EMFILE; - type = nft_obj_type_get(net, objtype); + type = nft_obj_type_get(net, objtype, family); if (IS_ERR(type)) { err = PTR_ERR(type); goto err_type; @@ -7649,28 +7709,26 @@ static void audit_log_obj_reset(const struct nft_table *table, kfree(buf); } -struct nft_obj_filter { +struct nft_obj_dump_ctx { + unsigned int s_idx; char *table; u32 type; + bool reset; }; static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_table *table; - unsigned int idx = 0, s_idx = cb->args[0]; - struct nft_obj_filter *filter = cb->data; + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + const struct nft_table *table; unsigned int entries = 0; struct nft_object *obj; - bool reset = false; + unsigned int idx = 0; int rc = 0; - if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) - reset = true; - rcu_read_lock(); nft_net = nft_pernet(net); cb->seq = READ_ONCE(nft_net->base_seq); @@ -7683,17 +7741,12 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) list_for_each_entry_rcu(obj, &table->objects, list) { if (!nft_is_active(net, obj)) goto cont; - if (idx < s_idx) + if (idx < ctx->s_idx) goto cont; - if (idx > s_idx) - memset(&cb->args[1], 0, - sizeof(cb->args) - sizeof(cb->args[0])); - if (filter && filter->table && - strcmp(filter->table, table->name)) + if (ctx->table && strcmp(ctx->table, table->name)) goto cont; - if (filter && - filter->type != NFT_OBJECT_UNSPEC && - obj->ops->type->type != filter->type) + if (ctx->type != NFT_OBJECT_UNSPEC && + obj->ops->type->type != ctx->type) goto cont; rc = nf_tables_fill_obj_info(skb, net, @@ -7702,7 +7755,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) NFT_MSG_NEWOBJ, NLM_F_MULTI | NLM_F_APPEND, table->family, table, - obj, reset); + obj, ctx->reset); if (rc < 0) break; @@ -7711,51 +7764,44 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) cont: idx++; } - if (reset && entries) + if (ctx->reset && entries) audit_log_obj_reset(table, nft_net->base_seq, entries); if (rc < 0) break; } rcu_read_unlock(); - cb->args[0] = idx; + ctx->s_idx = idx; return skb->len; } static int nf_tables_dump_obj_start(struct netlink_callback *cb) { + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; const struct nlattr * const *nla = cb->data; - struct nft_obj_filter *filter = NULL; - if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) { - filter = kzalloc(sizeof(*filter), GFP_ATOMIC); - if (!filter) + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + if (nla[NFTA_OBJ_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); + if (!ctx->table) return -ENOMEM; + } - if (nla[NFTA_OBJ_TABLE]) { - filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); - if (!filter->table) { - kfree(filter); - return -ENOMEM; - } - } + if (nla[NFTA_OBJ_TYPE]) + ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - if (nla[NFTA_OBJ_TYPE]) - filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - } + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + ctx->reset = true; - cb->data = filter; return 0; } static int nf_tables_dump_obj_done(struct netlink_callback *cb) { - struct nft_obj_filter *filter = cb->data; + struct nft_obj_dump_ctx *ctx = (void *)cb->ctx; - if (filter) { - kfree(filter->table); - kfree(filter); - } + kfree(ctx->table); return 0; } @@ -8329,9 +8375,9 @@ static int nf_tables_newflowtable(struct sk_buff *skb, u8 family = info->nfmsg->nfgen_family; const struct nf_flowtable_type *type; struct nft_flowtable *flowtable; - struct nft_hook *hook, *next; struct net *net = info->net; struct nft_table *table; + struct nft_trans *trans; struct nft_ctx ctx; int err; @@ -8411,34 +8457,34 @@ static int nf_tables_newflowtable(struct sk_buff *skb, err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable, extack, true); if (err < 0) - goto err4; + goto err_flowtable_parse_hooks; list_splice(&flowtable_hook.list, &flowtable->hook_list); flowtable->data.priority = flowtable_hook.priority; flowtable->hooknum = flowtable_hook.num; + trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto err_flowtable_trans; + } + + /* This must be LAST to ensure no packets are walking over this flowtable. */ err = nft_register_flowtable_net_hooks(ctx.net, table, &flowtable->hook_list, flowtable); - if (err < 0) { - nft_hooks_destroy(&flowtable->hook_list); - goto err4; - } - - err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; + goto err_flowtable_hooks; list_add_tail_rcu(&flowtable->list, &table->flowtables); return 0; -err5: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - nft_unregister_flowtable_hook(net, flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } -err4: + +err_flowtable_hooks: + nft_trans_destroy(trans); +err_flowtable_trans: + nft_hooks_destroy(&flowtable->hook_list); +err_flowtable_parse_hooks: flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); @@ -8722,6 +8768,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { + struct netlink_ext_ack *extack = info->extack; u8 genmask = nft_genmask_cur(info->net); u8 family = info->nfmsg->nfgen_family; struct nft_flowtable *flowtable; @@ -8747,13 +8794,17 @@ static int nf_tables_getflowtable(struct sk_buff *skb, table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, genmask, 0); - if (IS_ERR(table)) + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); + } flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], genmask); - if (IS_ERR(flowtable)) + if (IS_ERR(flowtable)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return PTR_ERR(flowtable); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) @@ -9009,7 +9060,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_GETRULE_RESET] = { - .call = nf_tables_getrule, + .call = nf_tables_getrule_reset, .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, @@ -9259,7 +9310,7 @@ static void nft_commit_release(struct nft_trans *trans) case NFT_MSG_DESTROYSETELEM: nf_tables_set_elem_destroy(&trans->ctx, nft_trans_elem_set(trans), - nft_trans_elem(trans).priv); + nft_trans_elem_priv(trans)); break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: @@ -9488,16 +9539,12 @@ void nft_chain_del(struct nft_chain *chain) static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, struct nft_trans_gc *trans) { - void **priv = trans->priv; + struct nft_elem_priv **priv = trans->priv; unsigned int i; for (i = 0; i < trans->count; i++) { - struct nft_set_elem elem = { - .priv = priv[i], - }; - - nft_setelem_data_deactivate(ctx->net, trans->set, &elem); - nft_setelem_remove(ctx->net, trans->set, &elem); + nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]); + nft_setelem_remove(ctx->net, trans->set, priv[i]); } } @@ -9510,7 +9557,7 @@ void nft_trans_gc_destroy(struct nft_trans_gc *trans) static void nft_trans_gc_trans_free(struct rcu_head *rcu) { - struct nft_set_elem elem = {}; + struct nft_elem_priv *elem_priv; struct nft_trans_gc *trans; struct nft_ctx ctx = {}; unsigned int i; @@ -9519,11 +9566,11 @@ static void nft_trans_gc_trans_free(struct rcu_head *rcu) ctx.net = read_pnet(&trans->set->net); for (i = 0; i < trans->count; i++) { - elem.priv = trans->priv[i]; - if (!nft_setelem_is_catchall(trans->set, &elem)) + elem_priv = trans->priv[i]; + if (!nft_setelem_is_catchall(trans->set, elem_priv)) atomic_dec(&trans->set->nelems); - nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv); } nft_trans_gc_destroy(trans); @@ -9700,8 +9747,9 @@ dead_elem: struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { struct nft_set_elem_catchall *catchall, *next; + u64 tstamp = nft_net_tstamp(gc->net); const struct nft_set *set = gc->set; - struct nft_set_elem elem; + struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); @@ -9709,19 +9757,17 @@ struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_expired(ext)) + if (!__nft_set_elem_expired(ext, tstamp)) continue; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return NULL; - memset(&elem, 0, sizeof(elem)); - elem.priv = catchall->elem; - - nft_setelem_data_deactivate(gc->net, gc->set, &elem); + elem_priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); nft_setelem_catchall_destroy(catchall); - nft_trans_gc_elem_add(gc, elem.priv); + nft_trans_gc_elem_add(gc, elem_priv); } return gc; @@ -10105,9 +10151,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_activate(net, te->set, &te->elem); + nft_setelem_activate(net, te->set, te->elem_priv); nf_tables_setelem_notify(&trans->ctx, te->set, - &te->elem, + te->elem_priv, NFT_MSG_NEWSETELEM); if (te->set->ops->commit && list_empty(&te->set->pending_update)) { @@ -10121,10 +10167,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) te = (struct nft_trans_elem *)trans->data; nf_tables_setelem_notify(&trans->ctx, te->set, - &te->elem, + te->elem_priv, trans->msg_type); - nft_setelem_remove(net, te->set, &te->elem); - if (!nft_setelem_is_catchall(te->set, &te->elem)) { + nft_setelem_remove(net, te->set, te->elem_priv); + if (!nft_setelem_is_catchall(te->set, te->elem_priv)) { atomic_dec(&te->set->nelems); te->set->ndeact--; } @@ -10244,7 +10290,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv, true); + nft_trans_elem_priv(trans), true); break; case NFT_MSG_NEWOBJ: nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); @@ -10392,8 +10438,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; } te = (struct nft_trans_elem *)trans->data; - nft_setelem_remove(net, te->set, &te->elem); - if (!nft_setelem_is_catchall(te->set, &te->elem)) + nft_setelem_remove(net, te->set, te->elem_priv); + if (!nft_setelem_is_catchall(te->set, te->elem_priv)) atomic_dec(&te->set->nelems); if (te->set->ops->abort && @@ -10406,9 +10452,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = (struct nft_trans_elem *)trans->data; - nft_setelem_data_activate(net, te->set, &te->elem); - nft_setelem_activate(net, te->set, &te->elem); - if (!nft_setelem_is_catchall(te->set, &te->elem)) + nft_setelem_data_activate(net, te->set, te->elem_priv); + nft_setelem_activate(net, te->set, te->elem_priv); + if (!nft_setelem_is_catchall(te->set, te->elem_priv)) te->set->ndeact--; if (te->set->ops->abort && @@ -10497,6 +10543,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) bool genid_ok; mutex_lock(&nft_net->commit_mutex); + nft_net->tstamp = get_jiffies_64(); genid_ok = genid == 0 || nft_net->base_seq == genid; if (!genid_ok) @@ -10584,9 +10631,9 @@ static int nft_check_loops(const struct nft_ctx *ctx, static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, - struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) @@ -11393,4 +11440,5 @@ module_exit(nf_tables_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Framework for packet filtering and classification"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 711c22ab70..c3e6353647 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -115,7 +115,7 @@ static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt, { enum nft_trace_types type; - switch (regs->verdict.code) { + switch (regs->verdict.code & NF_VERDICT_MASK) { case NFT_CONTINUE: case NFT_RETURN: type = NFT_TRACETYPE_RETURN; @@ -308,10 +308,11 @@ next_rule: switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: - case NF_DROP: case NF_QUEUE: case NF_STOLEN: return regs.verdict.code; + case NF_DROP: + return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); } switch (regs.verdict.code) { @@ -342,6 +343,9 @@ next_rule: if (static_branch_unlikely(&nft_counters_enabled)) nft_update_chain_stats(basechain, pkt); + if (nft_base_chain(basechain)->policy == NF_DROP) + return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); + return nft_base_chain(basechain)->policy; } EXPORT_SYMBOL_GPL(nft_do_chain); diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 6d41c0bd3d..a83637e3f4 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -258,17 +258,21 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, case __NFT_TRACETYPE_MAX: break; case NFT_TRACETYPE_RETURN: - case NFT_TRACETYPE_RULE: + case NFT_TRACETYPE_RULE: { + unsigned int v; + if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict)) goto nla_put_failure; /* pkt->skb undefined iff NF_STOLEN, disable dump */ - if (verdict->code == NF_STOLEN) + v = verdict->code & NF_VERDICT_MASK; + if (v == NF_STOLEN) info->packet_dumped = true; else mark = pkt->skb->mark; break; + } case NFT_TRACETYPE_POLICY: mark = pkt->skb->mark; diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 50723ba082..c0fc431991 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -447,4 +447,5 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 556bc902af..5cf38fc0a3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -228,19 +228,29 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; - int err; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { + unsigned int ct_verdict = verdict; + rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - if (ct_hook) { - err = ct_hook->update(entry->state.net, entry->skb); - if (err < 0) - verdict = NF_DROP; - } + if (ct_hook) + ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); + + switch (ct_verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + /* follow userspace verdict, could be REPEAT */ + break; + case NF_STOLEN: + nf_queue_entry_free(entry); + return; + default: + verdict = ct_verdict & NF_VERDICT_MASK; + break; + } } nf_reinject(entry, verdict); } diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index 98e4946100..40e230d8b7 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -137,6 +137,7 @@ module_init(nft_chain_nat_init); module_exit(nft_chain_nat_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nftables network address translation support"); #ifdef CONFIG_NF_TABLES_IPV4 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat"); #endif diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f0eeda97bf..1f9474fefe 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -135,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_TARGET_REV] = { .type = NLA_U32 }, + [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; @@ -200,6 +200,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; + u32 l4proto; u32 flags; int err; @@ -212,12 +213,18 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return -EINVAL; flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS])); - if (flags & ~NFT_RULE_COMPAT_F_MASK) + if (flags & NFT_RULE_COMPAT_F_UNUSED || + flags & ~NFT_RULE_COMPAT_F_MASK) return -EINVAL; if (flags & NFT_RULE_COMPAT_F_INV) *inv = true; - *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO])); + if (l4proto > U16_MAX) + return -EINVAL; + + *proto = l4proto; + return 0; } @@ -419,7 +426,7 @@ static void nft_match_eval(const struct nft_expr *expr, static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, - [NFTA_MATCH_REV] = { .type = NLA_U32 }, + [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; @@ -724,7 +731,7 @@ out_put: static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING, .len = NFT_COMPAT_NAME_MAX-1 }, - [NFTA_COMPAT_REV] = { .type = NLA_U32 }, + [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_COMPAT_TYPE] = { .type = NLA_U32 }, }; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 86bb9d7797..bfd3e5a14d 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -476,6 +476,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, break; #endif case NFT_CT_ID: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); break; default: @@ -1250,7 +1253,31 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + switch (priv->l3num) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + if (priv->l3num != ctx->family) + return -EINVAL; + + fallthrough; + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + switch (priv->l4proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_DCCP: + case IPPROTO_SCTP: + break; + default: + return -EOPNOTSUPP; + } + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 629a91a8c6..c09dba5735 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -44,33 +44,34 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, return 0; } -static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, - struct nft_regs *regs) +static struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set_ext *ext; + void *elem_priv; u64 timeout; - void *elem; if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; timeout = priv->timeout ? : set->timeout; - elem = nft_set_elem_init(set, &priv->tmpl, - ®s->data[priv->sreg_key], NULL, - ®s->data[priv->sreg_data], - timeout, 0, GFP_ATOMIC); - if (IS_ERR(elem)) + elem_priv = nft_set_elem_init(set, &priv->tmpl, + ®s->data[priv->sreg_key], NULL, + ®s->data[priv->sreg_data], + timeout, 0, GFP_ATOMIC); + if (IS_ERR(elem_priv)) goto err1; - ext = nft_set_elem_ext(set, elem); + ext = nft_set_elem_ext(set, elem_priv); if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0) goto err2; - return elem; + return elem_priv; err2: - nft_set_elem_destroy(set, elem, false); + nft_set_elem_destroy(set, elem_priv, false); err1: if (set->size) atomic_dec(&set->nelems); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index ca905aa822..37cfe6dd71 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -208,4 +208,5 @@ bool nft_fib_reduce(struct nft_regs_track *track, EXPORT_SYMBOL_GPL(nft_fib_reduce); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Query routing table from nftables"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index a5268e6dd3..358e742afa 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -270,4 +270,5 @@ module_exit(nft_fwd_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nftables netdev packet forwarding support"); MODULE_ALIAS_NFT_AF_EXPR(5, "fwd"); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 1e5e7a181e..32df7a1683 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -13,6 +13,7 @@ #include <net/netfilter/nf_tables_core.h> struct nft_bitmap_elem { + struct nft_elem_priv priv; struct list_head head; struct nft_set_ext ext; }; @@ -104,8 +105,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, return NULL; } -static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_bitmap_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { const struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -116,23 +118,23 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set, !nft_set_elem_active(&be->ext, genmask)) continue; - return be; + return &be->priv; } return ERR_PTR(-ENOENT); } static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *new = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; be = nft_bitmap_elem_find(set, new, genmask); if (be) { - *ext = &be->ext; + *elem_priv = &be->priv; return -EEXIST; } @@ -144,12 +146,11 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, return 0; } -static void nft_bitmap_remove(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_bitmap_remove(const struct net *net, const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -161,10 +162,10 @@ static void nft_bitmap_remove(const struct net *net, static void nft_bitmap_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *be = elem->priv; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -174,28 +175,27 @@ static void nft_bitmap_activate(const struct net *net, nft_set_elem_change_active(net, set, &be->ext); } -static bool nft_bitmap_flush(const struct net *net, - const struct nft_set *set, void *_be) +static void nft_bitmap_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { + struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv); struct nft_bitmap *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); - struct nft_bitmap_elem *be = _be; u32 idx, off; nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off); /* Enter 10 state, similar to deactivation. */ priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - - return true; } -static void *nft_bitmap_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be; struct nft_bitmap *priv = nft_set_priv(set); - struct nft_bitmap_elem *this = elem->priv, *be; u8 genmask = nft_genmask_next(net); u32 idx, off; @@ -209,7 +209,7 @@ static void *nft_bitmap_deactivate(const struct net *net, priv->bitmap[idx] &= ~(genmask << off); nft_set_elem_change_active(net, set, &be->ext); - return be; + return &be->priv; } static void nft_bitmap_walk(const struct nft_ctx *ctx, @@ -218,7 +218,6 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - struct nft_set_elem elem; list_for_each_entry_rcu(be, &priv->list, head) { if (iter->count < iter->skip) @@ -226,9 +225,7 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, if (!nft_set_elem_active(&be->ext, iter->genmask)) goto cont; - elem.priv = be; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &be->priv); if (iter->err < 0) return; @@ -265,6 +262,8 @@ static int nft_bitmap_init(const struct nft_set *set, { struct nft_bitmap *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0); + INIT_LIST_HEAD(&priv->list); priv->bitmap_size = nft_bitmap_size(set->klen); @@ -278,7 +277,7 @@ static void nft_bitmap_destroy(const struct nft_ctx *ctx, struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nf_tables_set_elem_destroy(ctx, set, be); + nf_tables_set_elem_destroy(ctx, set, &be->priv); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 2013de934c..6968a3b342 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -27,6 +27,7 @@ struct nft_rhash { }; struct nft_rhash_elem { + struct nft_elem_priv priv; struct rhash_head node; struct nft_set_ext ext; }; @@ -35,6 +36,7 @@ struct nft_rhash_cmp_arg { const struct nft_set *set; const u32 *key; u8 genmask; + u64 tstamp; }; static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed) @@ -61,7 +63,7 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, return 1; if (nft_set_elem_is_dead(&he->ext)) return 1; - if (nft_set_elem_expired(&he->ext)) + if (__nft_set_elem_expired(&he->ext, x->tstamp)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) return 1; @@ -86,6 +88,7 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); @@ -95,8 +98,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, return !!he; } -static void *nft_rhash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rhash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -104,17 +108,19 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set, .genmask = nft_genmask_cur(net), .set = set, .key = elem->key.val.data, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - return he; + return &he->priv; return ERR_PTR(-ENOENT); } static bool nft_rhash_update(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, + struct nft_elem_priv * + (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *regs), const struct nft_expr *expr, @@ -123,20 +129,23 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he, *prev; + struct nft_elem_priv *elem_priv; struct nft_rhash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, .key = key, + .tstamp = get_jiffies_64(), }; he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) goto out; - he = new(set, expr, regs); - if (he == NULL) + elem_priv = new(set, expr, regs); + if (!elem_priv) goto err1; + he = nft_elem_priv_cast(elem_priv); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -144,7 +153,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); he = prev; } @@ -154,7 +163,7 @@ out: return true; err2: - nft_set_elem_destroy(set, he, true); + nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); err1: return false; @@ -162,14 +171,15 @@ err1: static int nft_rhash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; struct nft_rhash_elem *prev; @@ -178,33 +188,32 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, if (IS_ERR(prev)) return PTR_ERR(prev); if (prev) { - *ext = &prev->ext; + *elem_priv = &prev->priv; return -EEXIST; } return 0; } static void nft_rhash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = elem->priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); } -static bool nft_rhash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_rhash_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rhash_elem *he = priv; + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); - - return true; } -static void *nft_rhash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_rhash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; @@ -212,6 +221,7 @@ static void *nft_rhash_deactivate(const struct net *net, .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, + .tstamp = nft_net_tstamp(net), }; rcu_read_lock(); @@ -221,15 +231,15 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_unlock(); - return he; + return &he->priv; } static void nft_rhash_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { + struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv); struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he = elem->priv; rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } @@ -260,7 +270,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he; struct rhashtable_iter hti; - struct nft_set_elem elem; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -280,9 +289,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; - elem.priv = he; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) break; @@ -406,6 +413,8 @@ static int nft_rhash_init(const struct nft_set *set, struct rhashtable_params params = nft_rhash_params; int err; + BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0); + params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT; params.key_len = set->klen; @@ -428,8 +437,9 @@ struct nft_rhash_ctx { static void nft_rhash_elem_destroy(void *ptr, void *arg) { struct nft_rhash_ctx *rhash_ctx = arg; + struct nft_rhash_elem *he = ptr; - nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv); } static void nft_rhash_destroy(const struct nft_ctx *ctx, @@ -476,6 +486,7 @@ struct nft_hash { }; struct nft_hash_elem { + struct nft_elem_priv priv; struct hlist_node node; struct nft_set_ext ext; }; @@ -501,8 +512,9 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set, return false; } -static void *nft_hash_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_hash_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -514,7 +526,7 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && nft_set_elem_active(&he->ext, genmask)) - return he; + return &he->priv; } return ERR_PTR(-ENOENT); } @@ -562,9 +574,9 @@ static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { - struct nft_hash_elem *this = elem->priv, *he; + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); u32 hash; @@ -574,7 +586,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, if (!memcmp(nft_set_ext_key(&this->ext), nft_set_ext_key(&he->ext), set->klen) && nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; + *elem_priv = &he->priv; return -EEXIST; } } @@ -583,28 +595,28 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, } static void nft_hash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); } -static bool nft_hash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_hash_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &he->ext); - return true; } -static void *nft_hash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_hash_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he; struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *this = elem->priv, *he; u8 genmask = nft_genmask_next(net); u32 hash; @@ -614,7 +626,7 @@ static void *nft_hash_deactivate(const struct net *net, set->klen) && nft_set_elem_active(&he->ext, genmask)) { nft_set_elem_change_active(net, set, &he->ext); - return he; + return &he->priv; } } return NULL; @@ -622,9 +634,9 @@ static void *nft_hash_deactivate(const struct net *net, static void nft_hash_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_hash_elem *he = elem->priv; + struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv); hlist_del_rcu(&he->node); } @@ -634,7 +646,6 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; - struct nft_set_elem elem; int i; for (i = 0; i < priv->buckets; i++) { @@ -644,9 +655,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; - elem.priv = he; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &he->priv); if (iter->err < 0) return; cont: @@ -685,7 +694,7 @@ static void nft_hash_destroy(const struct nft_ctx *ctx, for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nf_tables_set_elem_destroy(ctx, set, he); + nf_tables_set_elem_destroy(ctx, set, &he->priv); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 3ff31043f7..3089c4ca8f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -342,9 +342,6 @@ #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); - /** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits @@ -412,6 +409,7 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_scratch *scratch; unsigned long *res_map, *fill_map; u8 genmask = nft_genmask_cur(net); const u8 *rp = (const u8 *)key; @@ -422,15 +420,17 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, local_bh_disable(); - map_index = raw_cpu_read(nft_pipapo_scratch_index); - m = rcu_dereference(priv->match); if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) goto out; - res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); - fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + scratch = *raw_cpu_ptr(m->scratch); + + map_index = scratch->map_index; + + res_map = scratch->map + (map_index ? m->bsize_max : 0); + fill_map = scratch->map + (map_index ? 0 : m->bsize_max); memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); @@ -460,7 +460,7 @@ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) { - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; local_bh_enable(); return false; @@ -477,7 +477,7 @@ next_match: * current inactive bitmap is clean and can be reused as * *next* bitmap (not initial) for the next packet. */ - raw_cpu_write(nft_pipapo_scratch_index, map_index); + scratch->map_index = map_index; local_bh_enable(); return true; @@ -504,6 +504,7 @@ out: * @set: nftables API set representation * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask + * @tstamp: timestamp to check for expired elements * * This is essentially the same as the lookup function, except that it matches * key data against the uncommitted copy and doesn't use preallocated maps for @@ -513,7 +514,8 @@ out: */ static struct nft_pipapo_elem *pipapo_get(const struct net *net, const struct nft_set *set, - const u8 *data, u8 genmask) + const u8 *data, u8 genmask, + u64 tstamp) { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); struct nft_pipapo *priv = nft_set_priv(set); @@ -566,7 +568,7 @@ next_match: goto out; if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext)) + if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp)) goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) @@ -599,11 +601,18 @@ out: * @elem: nftables API element representation containing key data * @flags: Unused */ -static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net), get_jiffies_64()); + if (IS_ERR(e)) + return ERR_CAST(e); + + return &e->priv; } /** @@ -1102,6 +1111,25 @@ static void pipapo_map(struct nft_pipapo_match *m, } /** + * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address + * @m: Matching data + * @cpu: CPU number + */ +static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) +{ + struct nft_pipapo_scratch *s; + void *mem; + + s = *per_cpu_ptr(m->scratch, cpu); + if (!s) + return; + + mem = s; + mem -= s->align_off; + kfree(mem); +} + +/** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions * @bsize_max: Maximum bucket size, scratch maps cover two buckets @@ -1114,12 +1142,13 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, int i; for_each_possible_cpu(i) { - unsigned long *scratch; + struct nft_pipapo_scratch *scratch; #ifdef NFT_PIPAPO_ALIGN - unsigned long *scratch_aligned; + void *scratch_aligned; + u32 align_off; #endif - - scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + + scratch = kzalloc_node(struct_size(scratch, map, + bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL, cpu_to_node(i)); if (!scratch) { @@ -1133,14 +1162,25 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return -ENOMEM; } - kfree(*per_cpu_ptr(clone->scratch, i)); - - *per_cpu_ptr(clone->scratch, i) = scratch; + pipapo_free_scratch(clone, i); #ifdef NFT_PIPAPO_ALIGN - scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); - *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; + /* Align &scratch->map (not the struct itself): the extra + * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() + * above guarantee we can waste up to those bytes in order + * to align the map field regardless of its offset within + * the struct. + */ + BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); + + scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); + scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); + align_off = scratch_aligned - (void *)scratch; + + scratch = scratch_aligned; + scratch->align_off = align_off; #endif + *per_cpu_ptr(clone->scratch, i) = scratch; } return 0; @@ -1151,21 +1191,22 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data - * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element * * Return: 0 on success, error pointer on failure. */ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext2) + struct nft_elem_priv **elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo_elem *e = elem->priv, *dup; struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; u8 genmask = nft_genmask_next(net); + struct nft_pipapo_elem *e, *dup; + u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_field *f; const u8 *start_p, *end_p; int i, bsize_max, err = 0; @@ -1175,7 +1216,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else end = start; - dup = pipapo_get(net, set, start, genmask); + dup = pipapo_get(net, set, start, genmask, tstamp); if (!IS_ERR(dup)) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1188,7 +1229,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { - *ext2 = &dup->ext; + *elem_priv = &dup->priv; return -EEXIST; } @@ -1197,13 +1238,13 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (PTR_ERR(dup) == -ENOENT) { /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net)); + dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp); } if (PTR_ERR(dup) != -ENOENT) { if (IS_ERR(dup)) return PTR_ERR(dup); - *ext2 = &dup->ext; + *elem_priv = &dup->priv; return -ENOTEMPTY; } @@ -1263,7 +1304,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, put_cpu_ptr(m->scratch); } - *ext2 = &e->ext; + e = nft_elem_priv_cast(elem->priv); + *elem_priv = &e->priv; pipapo_map(m, rulemap, e); @@ -1293,11 +1335,6 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new->scratch) goto out_scratch; -#ifdef NFT_PIPAPO_ALIGN - new->scratch_aligned = alloc_percpu(*new->scratch_aligned); - if (!new->scratch_aligned) - goto out_scratch; -#endif for_each_possible_cpu(i) *per_cpu_ptr(new->scratch, i) = NULL; @@ -1349,10 +1386,7 @@ out_lt: } out_scratch_realloc: for_each_possible_cpu(i) - kfree(*per_cpu_ptr(new->scratch, i)); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(new->scratch_aligned); -#endif + pipapo_free_scratch(new, i); out_scratch: free_percpu(new->scratch); kfree(new); @@ -1540,23 +1574,19 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, struct nft_pipapo_elem *e) { - struct nft_set_elem elem = { - .priv = e, - }; - - nft_setelem_data_deactivate(net, set, &elem); + nft_setelem_data_deactivate(net, set, &e->priv); } /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @_set: nftables API set representation + * @set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) +static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { - struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); + u64 tstamp = nft_net_tstamp(net); int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; struct nft_trans_gc *gc; @@ -1591,7 +1621,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) /* synchronous gc never fails, there is no need to set on * NFT_SET_ELEM_DEAD_BIT. */ - if (nft_set_elem_expired(&e->ext)) { + if (__nft_set_elem_expired(&e->ext, tstamp)) { priv->dirty = true; gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); @@ -1637,13 +1667,9 @@ static void pipapo_free_match(struct nft_pipapo_match *m) int i; for_each_possible_cpu(i) - kfree(*per_cpu_ptr(m->scratch, i)); + pipapo_free_scratch(m, i); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif free_percpu(m->scratch); - pipapo_free_fields(m); kfree(m); @@ -1672,7 +1698,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. */ -static void nft_pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *new_clone, *old; @@ -1732,7 +1758,7 @@ static void nft_pipapo_abort(const struct nft_set *set) * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently * in use for lookups, and not directly inserted into current lookup data. Both @@ -1741,9 +1767,9 @@ static void nft_pipapo_abort(const struct nft_set *set) */ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e = elem->priv; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &e->ext); } @@ -1766,7 +1792,7 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, { struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net)); + e = pipapo_get(net, set, data, nft_genmask_next(net), nft_net_tstamp(net)); if (IS_ERR(e)) return NULL; @@ -1783,9 +1809,9 @@ static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, * * Return: deactivated element if found, NULL otherwise. */ -static void *nft_pipapo_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -1796,7 +1822,7 @@ static void *nft_pipapo_deactivate(const struct net *net, * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * This is functionally the same as nft_pipapo_deactivate(), with a slightly * different interface, and it's also called once for each element in a set @@ -1810,13 +1836,12 @@ static void *nft_pipapo_deactivate(const struct net *net, * * Return: true if element was found and deactivated. */ -static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, - void *elem) +static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_pipapo_elem *e = elem; + struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); - return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), - &e->ext); + nft_set_elem_change_active(net, set, &e->ext); } /** @@ -1939,7 +1964,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, * nft_pipapo_remove() - Remove element given key, commit * @net: Network namespace * @set: nftables API set representation - * @elem: nftables API element representation containing key data + * @elem_priv: nftables API element representation containing key data * * Similarly to nft_pipapo_activate(), this is used as commit operation by the * API, but it's called once per element in the pending transaction, so we can't @@ -1947,14 +1972,15 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, * the matched element here, if any, and commit the updated matching data. */ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; - struct nft_pipapo_elem *e = elem->priv; int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; const u8 *data; + e = nft_elem_priv_cast(elem_priv); data = (const u8 *)nft_set_ext_key(&e->ext); while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { @@ -2031,7 +2057,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; - struct nft_set_elem elem; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; @@ -2044,9 +2069,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, if (!nft_set_elem_active(&e->ext, iter->genmask)) goto cont; - elem.priv = e; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) goto out; @@ -2118,6 +2141,8 @@ static int nft_pipapo_init(const struct nft_set *set, struct nft_pipapo_field *f; int err, i, field_count; + BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); + field_count = desc->field_count ? : 1; if (field_count > NFT_PIPAPO_MAX_FIELDS) @@ -2130,7 +2155,7 @@ static int nft_pipapo_init(const struct nft_set *set, m->field_count = field_count; m->bsize_max = 0; - m->scratch = alloc_percpu(unsigned long *); + m->scratch = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch) { err = -ENOMEM; goto out_scratch; @@ -2138,16 +2163,6 @@ static int nft_pipapo_init(const struct nft_set *set, for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; -#ifdef NFT_PIPAPO_ALIGN - m->scratch_aligned = alloc_percpu(unsigned long *); - if (!m->scratch_aligned) { - err = -ENOMEM; - goto out_free; - } - for_each_possible_cpu(i) - *per_cpu_ptr(m->scratch_aligned, i) = NULL; -#endif - rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { @@ -2178,9 +2193,6 @@ static int nft_pipapo_init(const struct nft_set *set, return 0; out_free: -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif free_percpu(m->scratch); out_scratch: kfree(m); @@ -2212,7 +2224,7 @@ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, e = f->mt[r].e; - nf_tables_set_elem_destroy(ctx, set, e); + nf_tables_set_elem_destroy(ctx, set, &e->priv); } } @@ -2234,11 +2246,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, nft_set_pipapo_match_destroy(ctx, set, m); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(m->scratch_aligned); -#endif for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(m->scratch, cpu)); + pipapo_free_scratch(m, cpu); free_percpu(m->scratch); pipapo_free_fields(m); kfree(m); @@ -2251,11 +2260,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, if (priv->dirty) nft_set_pipapo_match_destroy(ctx, set, m); -#ifdef NFT_PIPAPO_ALIGN - free_percpu(priv->clone->scratch_aligned); -#endif for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); + pipapo_free_scratch(priv->clone, cpu); free_percpu(priv->clone->scratch); pipapo_free_fields(priv->clone); diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 2e164a3199..f59a0cd811 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -131,20 +131,28 @@ struct nft_pipapo_field { }; /** + * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @map_index: Current working bitmap index, toggled between field matches + * @align_off: Offset to get the originally allocated address + * @map: store partial matching results during lookup + */ +struct nft_pipapo_scratch { + u8 map_index; + u32 align_off; + unsigned long map[]; +}; + +/** * struct nft_pipapo_match - Data used for lookup and matching * @field_count Amount of fields in set * @scratch: Preallocated per-CPU maps for partial matching results - * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes * @bsize_max: Maximum lookup table bucket size of all fields, in longs * @rcu Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { int field_count; -#ifdef NFT_PIPAPO_ALIGN - unsigned long * __percpu *scratch_aligned; -#endif - unsigned long * __percpu *scratch; + struct nft_pipapo_scratch * __percpu *scratch; size_t bsize_max; struct rcu_head rcu; struct nft_pipapo_field f[] __counted_by(field_count); @@ -170,10 +178,12 @@ struct nft_pipapo_elem; /** * struct nft_pipapo_elem - API-facing representation of single set element + * @priv: element placeholder * @ext: nftables API extensions */ struct nft_pipapo_elem { - struct nft_set_ext ext; + struct nft_elem_priv priv; + struct nft_set_ext ext; }; int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 52e0d026d3..a3a8ddca99 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -57,7 +57,7 @@ /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ - asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load @@ -71,9 +71,6 @@ #define NFT_PIPAPO_AVX2_ZERO(reg) \ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) -/* Current working bitmap index, toggled between field matches */ -static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); - /** * nft_pipapo_avx2_prepare() - Prepare before main algorithm body * @@ -1120,11 +1117,12 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); - unsigned long *res, *fill, *scratch; + struct nft_pipapo_scratch *scratch; u8 genmask = nft_genmask_cur(net); const u8 *rp = (const u8 *)key; struct nft_pipapo_match *m; struct nft_pipapo_field *f; + unsigned long *res, *fill; bool map_index; int i, ret = 0; @@ -1141,15 +1139,16 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, */ kernel_fpu_begin_mask(0); - scratch = *raw_cpu_ptr(m->scratch_aligned); + scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) { kernel_fpu_end(); return false; } - map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index); - res = scratch + (map_index ? m->bsize_max : 0); - fill = scratch + (map_index ? 0 : m->bsize_max); + map_index = scratch->map_index; + + res = scratch->map + (map_index ? m->bsize_max : 0); + fill = scratch->map + (map_index ? 0 : m->bsize_max); /* Starting map doesn't need to be set for this implementation */ @@ -1221,7 +1220,7 @@ next_match: out: if (i % 2) - raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index); + scratch->map_index = !map_index; kernel_fpu_end(); return ret >= 0; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e34662f4a7..9944fe479e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -19,10 +19,11 @@ struct nft_rbtree { struct rb_root root; rwlock_t lock; seqcount_rwlock_t count; - struct delayed_work gc_work; + unsigned long last_gc; }; struct nft_rbtree_elem { + struct nft_elem_priv priv; struct rb_node node; struct nft_set_ext ext; }; @@ -48,8 +49,7 @@ static int nft_rbtree_cmp(const struct nft_set *set, static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) { - return nft_set_elem_expired(&rbe->ext) || - nft_set_elem_is_dead(&rbe->ext); + return nft_set_elem_expired(&rbe->ext); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, @@ -197,8 +197,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, return false; } -static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static struct nft_elem_priv * +nft_rbtree_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); @@ -209,33 +210,31 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); if (ret || !read_seqcount_retry(&priv->count, seq)) - return rbe; + return &rbe->priv; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (!ret) - rbe = ERR_PTR(-ENOENT); read_unlock_bh(&priv->lock); - return rbe; + if (!ret) + return ERR_PTR(-ENOENT); + + return &rbe->priv; } -static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) +static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) { - struct nft_set_elem elem = { - .priv = rbe, - }; - - nft_setelem_data_deactivate(net, set, &elem); + lockdep_assert_held_write(&priv->lock); + nft_setelem_data_deactivate(net, set, &rbe->priv); rb_erase(&rbe->node, &priv->root); } static const struct nft_rbtree_elem * nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, u8 genmask) + struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -254,7 +253,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev) && - nft_set_elem_active(&rbe_prev->ext, genmask)) + nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); @@ -263,7 +262,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); - nft_rbtree_gc_remove(net, set, priv, rbe_prev); + nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); /* There is always room in this trans gc for this element, * memory allocation never actually happens, hence, the warning @@ -277,7 +276,7 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, nft_trans_gc_elem_add(gc, rbe_prev); } - nft_rbtree_gc_remove(net, set, priv, rbe); + nft_rbtree_gc_elem_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) return ERR_PTR(-ENOMEM); @@ -307,13 +306,14 @@ static bool nft_rbtree_update_first(const struct nft_set *set, static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); + u64 tstamp = nft_net_tstamp(net); int d; /* Descend the tree to search for an existing element greater than the @@ -361,11 +361,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, /* perform garbage collection to avoid bogus overlap reports * but skip new elements in this transaction. */ - if (nft_set_elem_expired(&rbe->ext) && + if (__nft_set_elem_expired(&rbe->ext, tstamp) && nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; - removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + removed_end = nft_rbtree_gc_elem(set, priv, rbe); if (IS_ERR(removed_end)) return PTR_ERR(removed_end); @@ -424,7 +424,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { - *ext = &rbe_ge->ext; + *elem_priv = &rbe_ge->priv; return -EEXIST; } @@ -433,7 +433,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { - *ext = &rbe_le->ext; + *elem_priv = &rbe_le->priv; return -EEXIST; } @@ -485,10 +485,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, - struct nft_set_ext **ext) + struct nft_elem_priv **elem_priv) { + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; int err; do { @@ -499,7 +499,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); + err = __nft_rbtree_insert(net, set, rbe, elem_priv); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } while (err == -EAGAIN); @@ -507,13 +507,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, return err; } -static void nft_rbtree_remove(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { - struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe = elem->priv; - write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); rb_erase(&rbe->node, &priv->root); @@ -521,33 +516,43 @@ static void nft_rbtree_remove(const struct net *net, write_unlock_bh(&priv->lock); } +static void nft_rbtree_remove(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) +{ + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); + struct nft_rbtree *priv = nft_set_priv(set); + + nft_rbtree_erase(priv, rbe); +} + static void nft_rbtree_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = elem->priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &rbe->ext); } -static bool nft_rbtree_flush(const struct net *net, - const struct nft_set *set, void *priv) +static void nft_rbtree_flush(const struct net *net, + const struct nft_set *set, + struct nft_elem_priv *elem_priv) { - struct nft_rbtree_elem *rbe = priv; + struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &rbe->ext); - - return true; } -static void *nft_rbtree_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static struct nft_elem_priv * +nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_next(net); + u64 tstamp = nft_net_tstamp(net); int d; while (parent != NULL) { @@ -568,14 +573,14 @@ static void *nft_rbtree_deactivate(const struct net *net, nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; - } else if (nft_set_elem_expired(&rbe->ext)) { + } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) { break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; } - nft_rbtree_flush(net, set, rbe); - return rbe; + nft_rbtree_flush(net, set, &rbe->priv); + return &rbe->priv; } } return NULL; @@ -587,7 +592,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; - struct nft_set_elem elem; struct rb_node *node; read_lock_bh(&priv->lock); @@ -599,9 +603,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; - elem.priv = rbe; - - iter->err = iter->fn(ctx, set, iter, &elem); + iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { read_unlock_bh(&priv->lock); return; @@ -612,45 +614,35 @@ cont: read_unlock_bh(&priv->lock); } -static void nft_rbtree_gc(struct work_struct *work) +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + nft_setelem_data_deactivate(net, set, &rbe->priv); + nft_rbtree_erase(priv, rbe); +} + +static void nft_rbtree_gc(struct nft_set *set) { + struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; - struct nftables_pernet *nft_net; - struct nft_rbtree *priv; + struct net *net = read_pnet(&set->net); + u64 tstamp = nft_net_tstamp(net); + struct rb_node *node, *next; struct nft_trans_gc *gc; - struct rb_node *node; - struct nft_set *set; - unsigned int gc_seq; - struct net *net; - priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - nft_net = nft_pernet(net); - gc_seq = READ_ONCE(nft_net->gc_seq); - if (nft_set_gc_is_pending(set)) - goto done; - - gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) - goto done; - - read_lock_bh(&priv->lock); - for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + return; - /* Ruleset has been updated, try later. */ - if (READ_ONCE(nft_net->gc_seq) != gc_seq) { - nft_trans_gc_destroy(gc); - gc = NULL; - goto try_later; - } + for (node = rb_first(&priv->root); node ; node = next) { + next = rb_next(node); rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (nft_set_elem_is_dead(&rbe->ext)) - goto dead_elem; - /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is * always visited before the start element. @@ -659,40 +651,37 @@ static void nft_rbtree_gc(struct work_struct *work) rbe_end = rbe; continue; } - if (!nft_set_elem_expired(&rbe->ext)) + if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; - nft_set_elem_dead(&rbe->ext); - - if (!rbe_end) - continue; - - nft_set_elem_dead(&rbe_end->ext); - - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) goto try_later; - nft_trans_gc_elem_add(gc, rbe_end); - rbe_end = NULL; -dead_elem: - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + /* end element needs to be removed first, it has + * no timeout extension. + */ + if (rbe_end) { + nft_rbtree_gc_remove(net, set, priv, rbe_end); + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; + } + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) goto try_later; + nft_rbtree_gc_remove(net, set, priv, rbe); nft_trans_gc_elem_add(gc, rbe); } - gc = nft_trans_gc_catchall_async(gc, gc_seq); - try_later: - read_unlock_bh(&priv->lock); - if (gc) - nft_trans_gc_queue_async_done(gc); -done: - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); + if (gc) { + gc = nft_trans_gc_catchall_sync(gc); + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], @@ -707,15 +696,12 @@ static int nft_rbtree_init(const struct nft_set *set, { struct nft_rbtree *priv = nft_set_priv(set); + BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); + rwlock_init(&priv->lock); seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; - INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc); - if (set->flags & NFT_SET_TIMEOUT) - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); - return 0; } @@ -726,12 +712,10 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx, struct nft_rbtree_elem *rbe; struct rb_node *node; - cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nf_tables_set_elem_destroy(ctx, set, rbe); + nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } } @@ -753,6 +737,21 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_rbtree_commit(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + nft_rbtree_gc(set); +} + +static void nft_rbtree_gc_init(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + priv->last_gc = jiffies; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -766,6 +765,8 @@ const struct nft_set_type nft_set_rbtree_type = { .deactivate = nft_rbtree_deactivate, .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, + .commit = nft_rbtree_commit, + .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 9f21953c74..f735d79d8b 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -713,6 +713,7 @@ static const struct nft_object_ops nft_tunnel_obj_ops = { static struct nft_object_type nft_tunnel_obj_type __read_mostly = { .type = NFT_OBJECT_TUNNEL, + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_obj_ops, .maxattr = NFTA_TUNNEL_KEY_MAX, .policy = nft_tunnel_key_policy, |