From dc50eab76b709d68175a358d6e23a5a3890764d3 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 19:39:57 +0200 Subject: Merging upstream version 6.7.7. Signed-off-by: Daniel Baumann --- net/core/sock.c | 257 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 134 insertions(+), 123 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 383e30fe79..20160865ed 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -760,7 +760,7 @@ out: return ret; } -bool sk_mc_loop(struct sock *sk) +bool sk_mc_loop(const struct sock *sk) { if (dev_recursion_level()) return false; @@ -772,7 +772,7 @@ bool sk_mc_loop(struct sock *sk) return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return inet6_sk(sk)->mc_loop; + return inet6_test_bit(MC6_LOOP, sk); #endif } WARN_ON_ONCE(1); @@ -807,9 +807,7 @@ EXPORT_SYMBOL(sock_no_linger); void sock_set_priority(struct sock *sk, u32 priority) { - lock_sock(sk); WRITE_ONCE(sk->sk_priority, priority); - release_sock(sk); } EXPORT_SYMBOL(sock_set_priority); @@ -1119,6 +1117,94 @@ int sk_setsockopt(struct sock *sk, int level, int optname, valbool = val ? 1 : 0; + /* handle options which do not require locking the socket. */ + switch (optname) { + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + sock_set_priority(sk, val); + return 0; + } + return -EPERM; + case SO_PASSSEC: + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); + return 0; + case SO_PASSCRED: + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); + return 0; + case SO_PASSPIDFD: + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); + return 0; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + return -ENOPROTOOPT; +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_BUSY_POLL: + if (val < 0) + return -EINVAL; + WRITE_ONCE(sk->sk_ll_usec, val); + return 0; + case SO_PREFER_BUSY_POLL: + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + return 0; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && + !sockopt_capable(CAP_NET_ADMIN)) + return -EPERM; + if (val < 0 || val > U16_MAX) + return -EINVAL; + WRITE_ONCE(sk->sk_busy_poll_budget, val); + return 0; +#endif + case SO_MAX_PACING_RATE: + { + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; + unsigned long pacing_rate; + + if (sizeof(ulval) != sizeof(val) && + optlen >= sizeof(ulval) && + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { + return -EFAULT; + } + if (ulval != ~0UL) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); + /* Pairs with READ_ONCE() from sk_getsockopt() */ + WRITE_ONCE(sk->sk_max_pacing_rate, ulval); + pacing_rate = READ_ONCE(sk->sk_pacing_rate); + if (ulval < pacing_rate) + WRITE_ONCE(sk->sk_pacing_rate, ulval); + return 0; + } + case SO_TXREHASH: + if (val < -1 || val > 1) + return -EINVAL; + if ((u8)val == SOCK_TXREHASH_DEFAULT) + val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + /* Paired with READ_ONCE() in tcp_rtx_synack() + * and sk_getsockopt(). + */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + return 0; + case SO_PEEK_OFF: + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + return ret; + } + } + sockopt_lock_sock(sk); switch (optname) { @@ -1134,12 +1220,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: sk->sk_reuseport = valbool; break; - case SO_TYPE: - case SO_PROTOCOL: - case SO_DOMAIN: - case SO_ERROR: - ret = -ENOPROTOOPT; - break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); sk_dst_reset(sk); @@ -1214,15 +1294,6 @@ set_sndbuf: sk->sk_no_check_tx = valbool; break; - case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - WRITE_ONCE(sk->sk_priority, val); - else - ret = -EPERM; - break; - case SO_LINGER: if (optlen < sizeof(ling)) { ret = -EINVAL; /* 1003.1g */ @@ -1248,14 +1319,6 @@ set_sndbuf: case SO_BSDCOMPAT: break; - case SO_PASSCRED: - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - break; - - case SO_PASSPIDFD: - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - break; - case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: @@ -1361,9 +1424,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break; - case SO_PASSSEC: - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { @@ -1385,18 +1445,6 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break; - case SO_PEEK_OFF: - { - int (*set_peek_off)(struct sock *sk, int val); - - set_peek_off = READ_ONCE(sock->ops)->set_peek_off; - if (set_peek_off) - ret = set_peek_off(sk, val); - else - ret = -EOPNOTSUPP; - break; - } - case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); break; @@ -1405,50 +1453,7 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; -#ifdef CONFIG_NET_RX_BUSY_POLL - case SO_BUSY_POLL: - if (val < 0) - ret = -EINVAL; - else - WRITE_ONCE(sk->sk_ll_usec, val); - break; - case SO_PREFER_BUSY_POLL: - if (valbool && !sockopt_capable(CAP_NET_ADMIN)) - ret = -EPERM; - else - WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); - break; - case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { - ret = -EPERM; - } else { - if (val < 0 || val > U16_MAX) - ret = -EINVAL; - else - WRITE_ONCE(sk->sk_busy_poll_budget, val); - } - break; -#endif - case SO_MAX_PACING_RATE: - { - unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; - - if (sizeof(ulval) != sizeof(val) && - optlen >= sizeof(ulval) && - copy_from_sockptr(&ulval, optval, sizeof(ulval))) { - ret = -EFAULT; - break; - } - if (ulval != ~0UL) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - /* Pairs with READ_ONCE() from sk_getsockopt() */ - WRITE_ONCE(sk->sk_max_pacing_rate, ulval); - sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); - break; - } case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; @@ -1533,19 +1538,6 @@ set_sndbuf: break; } - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - if ((u8)val == SOCK_TXREHASH_DEFAULT) - val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - /* Paired with READ_ONCE() in tcp_rtx_synack() - * and sk_getsockopt(). - */ - WRITE_ONCE(sk->sk_txrehash, (u8)val); - break; - default: ret = -ENOPROTOOPT; break; @@ -2018,14 +2010,6 @@ lenout: return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - return sk_getsockopt(sock->sk, level, optname, - USER_SOCKPTR(optval), - USER_SOCKPTR(optlen)); -} - /* * Initialize an sk_lock. * @@ -3010,6 +2994,11 @@ void __sk_flush_backlog(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); __release_sock(sk); + + if (sk->sk_prot->release_cb) + INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, + tcp_release_cb, sk); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL_GPL(__sk_flush_backlog); @@ -3046,21 +3035,29 @@ EXPORT_SYMBOL(sk_wait_data); * @amt: pages to allocate * @kind: allocation type * - * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc + * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. + * + * Unlike the globally shared limits among the sockets under same protocol, + * consuming the budget of a memcg won't have direct effect on other ones. + * So be optimistic about memcg's tolerance, and leave the callers to decide + * whether or not to raise allocated through sk_under_memory_pressure() or + * its variants. */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { - bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; + struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; - bool charged = true; + bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); - if (memcg_charge && - !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge()))) - goto suppress_allocation; + + if (memcg) { + if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) + goto suppress_allocation; + charged = true; + } /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { @@ -3076,7 +3073,14 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; - /* guarantee minimum buffer size under pressure */ + /* Guarantee minimum buffer size under pressure (either global + * or memcg) to make sure features described in RFC 7323 (TCP + * Extensions for High Performance) work properly. + * + * This rule does NOT stand when exceeds global or memcg's hard + * limit, or else a DoS attack can be taken place by spawning + * lots of sockets whose usage are under minimum buffer size. + */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) return 1; @@ -3095,8 +3099,17 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) if (sk_has_memory_pressure(sk)) { u64 alloc; - if (!sk_under_memory_pressure(sk)) + /* The following 'average' heuristic is within the + * scope of global accounting, so it only makes + * sense for global memory pressure. + */ + if (!sk_under_global_memory_pressure(sk)) return 1; + + /* Try to be fair among all the sockets under global + * pressure by allowing the ones that below average + * usage to raise. + */ alloc = sk_sockets_allocated_read_positive(sk); if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + @@ -3115,8 +3128,8 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg_charge && !charged) { - mem_cgroup_charge_skmem(sk->sk_memcg, amt, + if (memcg && !charged) { + mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } return 1; @@ -3128,8 +3141,8 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (memcg_charge && charged) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + if (charged) + mem_cgroup_uncharge_skmem(memcg, amt); return 0; } @@ -3528,11 +3541,9 @@ void release_sock(struct sock *sk) if (sk->sk_backlog.tail) __release_sock(sk); - /* Warning : release_cb() might need to release sk ownership, - * ie call sock_release_ownership(sk) before us. - */ if (sk->sk_prot->release_cb) - sk->sk_prot->release_cb(sk); + INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, + tcp_release_cb, sk); sock_release_ownership(sk); if (waitqueue_active(&sk->sk_lock.wq)) -- cgit v1.2.3