| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-03 05:08:49 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-03 05:08:49 +0000 |
| commit | 76047c0312414915035f4ebbaa533fe3817a21e0 (patch) | |
| tree | 86f79ba3c1f753b1d39be793455035794fade031 /net/mptcp | |
| parent | Adding upstream version 6.8.11. (diff) | |
| download | linux-76047c0312414915035f4ebbaa533fe3817a21e0.tar.xz linux-76047c0312414915035f4ebbaa533fe3817a21e0.zip | |
Adding upstream version 6.8.12. (upstream/6.8.12)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/mptcp')
| -rw-r--r-- | net/mptcp/protocol.c | 54 |
| -rw-r--r-- | net/mptcp/protocol.h | 45 |
| -rw-r--r-- | net/mptcp/sockopt.c | 129 |

3 files changed, 161 insertions, 67 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 74c1faec27..54e29ab911 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1687,15 +1687,6 @@ out:
 	}
 }
 
-static void mptcp_set_nospace(struct sock *sk)
-{
-	/* enable autotune */
-	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
-	/* will be cleared on avail space */
-	set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
-}
-
 static int mptcp_disconnect(struct sock *sk, int flags);
 
 static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
@@ -1766,6 +1757,30 @@ static int do_copy_data_nocache(struct sock *sk, int copy,
 	return 0;
 }
 
+/* open-code sk_stream_memory_free() plus sent limit computation to
+ * avoid indirect calls in fast-path.
+ * Called under the msk socket lock, so we can avoid a bunch of ONCE
+ * annotations.
+ */
+static u32 mptcp_send_limit(const struct sock *sk)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	u32 limit, not_sent;
+
+	if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
+		return 0;
+
+	limit = mptcp_notsent_lowat(sk);
+	if (limit == UINT_MAX)
+		return UINT_MAX;
+
+	not_sent = msk->write_seq - msk->snd_nxt;
+	if (not_sent >= limit)
+		return 0;
+
+	return limit - not_sent;
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1810,6 +1825,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		struct mptcp_data_frag *dfrag;
 		bool dfrag_collapsed;
 		size_t psize, offset;
+		u32 copy_limit;
+
+		/* ensure fitting the notsent_lowat() constraint */
+		copy_limit = mptcp_send_limit(sk);
+		if (!copy_limit)
+			goto wait_for_memory;
 
 		/* reuse tail pfrag, if possible, or carve a new one from the
 		 * page allocator
@@ -1817,9 +1838,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		dfrag = mptcp_pending_tail(sk);
 		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
 		if (!dfrag_collapsed) {
-			if (!sk_stream_memory_free(sk))
-				goto wait_for_memory;
-
 			if (!mptcp_page_frag_refill(sk, pfrag))
 				goto wait_for_memory;
 
@@ -1834,6 +1852,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		offset = dfrag->offset + dfrag->data_len;
 		psize = pfrag->size - offset;
 		psize = min_t(size_t, psize, msg_data_left(msg));
+		psize = min_t(size_t, psize, copy_limit);
 		total_ts = psize + frag_truesize;
 
 		if (!sk_wmem_schedule(sk, total_ts))
@@ -1869,7 +1888,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			continue;
 
 wait_for_memory:
-		mptcp_set_nospace(sk);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		__mptcp_push_pending(sk, msg->msg_flags);
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
@@ -3770,6 +3789,7 @@ static struct proto mptcp_prot = {
 	.unhash		= mptcp_unhash,
 	.get_port	= mptcp_get_port,
 	.forward_alloc_get	= mptcp_forward_alloc_get,
+	.stream_memory_free	= mptcp_stream_memory_free,
 	.sockets_allocated	= &mptcp_sockets_allocated,
 
 	.memory_allocated	= &tcp_memory_allocated,
@@ -3941,12 +3961,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
 
-	if (sk_stream_is_writeable(sk))
+	if (__mptcp_stream_is_writeable(sk, 1))
 		return EPOLLOUT | EPOLLWRNORM;
 
-	mptcp_set_nospace(sk);
-	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
-	if (sk_stream_is_writeable(sk))
+	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+	smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
+	if (__mptcp_stream_is_writeable(sk, 1))
 		return EPOLLOUT | EPOLLWRNORM;
 
 	return 0;
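The protocol.c hunks above wire TCP_NOTSENT_LOWAT into the MPTCP send path: mptcp_send_limit() caps how much mptcp_sendmsg() may queue, and the new .stream_memory_free hook plus __mptcp_stream_is_writeable() gate EPOLLOUT on the same limit. Below is a minimal userspace sketch of the resulting behaviour; it is not part of the patch, and the loopback address, port 8080 and 128 KiB threshold are arbitrary example values.

```c
/* Minimal userspace sketch (not part of the patch): exercise the
 * TCP_NOTSENT_LOWAT behaviour added above on an MPTCP socket.
 * The loopback address, port 8080 and 128 KiB threshold are
 * arbitrary example values.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif
#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25	/* older libc headers may lack it */
#endif

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),
	};
	int lowat = 128 * 1024;	/* wake the writer only below 128 KiB unsent */
	char buf[4096] = { 0 };
	struct pollfd pfd;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0) {
		perror("socket(IPPROTO_MPTCP)");
		return 1;
	}

	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		close(fd);
		return 1;
	}

	/* MPTCP reuses the plain TCP option name and level; with the patch
	 * the limit is enforced against msk->write_seq - msk->snd_nxt.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat)) < 0)
		perror("setsockopt(TCP_NOTSENT_LOWAT)");

	pfd.fd = fd;
	pfd.events = POLLOUT;

	/* poll() stops reporting POLLOUT once more than `lowat` bytes sit
	 * unsent at the MPTCP level, not only when the send buffer is full.
	 */
	while (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLOUT)) {
		if (write(fd, buf, sizeof(buf)) < 0)
			break;
	}

	close(fd);
	return 0;
}
```

With a small threshold, the writer is throttled by the amount of data still unsent at the MPTCP level rather than by send-buffer exhaustion alone, which is exactly what the mptcp_send_limit() and mptcp_check_writeable() changes implement.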
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 07f6242afc..5f4c10c41c 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -113,10 +113,9 @@
 #define MPTCP_RST_TRANSIENT	BIT(0)
 
 /* MPTCP socket atomic flags */
-#define MPTCP_NOSPACE		1
-#define MPTCP_WORK_RTX		2
-#define MPTCP_FALLBACK_DONE	4
-#define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_WORK_RTX		1
+#define MPTCP_FALLBACK_DONE	2
+#define MPTCP_WORK_CLOSE_SUBFLOW 3
 
 /* MPTCP socket release cb flags */
 #define MPTCP_PUSH_PENDING	1
@@ -306,6 +305,10 @@ struct mptcp_sock {
 			in_accept_queue:1,
 			free_first:1,
 			rcvspace_init:1;
+	u32		notsent_lowat;
+	int		keepalive_cnt;
+	int		keepalive_idle;
+	int		keepalive_intvl;
 	struct work_struct work;
 	struct sk_buff  *ooo_last_skb;
 	struct rb_root  out_of_order_queue;
@@ -790,14 +793,36 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
 	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
 }
 
+static inline u32 mptcp_notsent_lowat(const struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	u32 val;
+
+	val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
+	return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
+}
+
+static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
+{
+	const struct mptcp_sock *msk = mptcp_sk(sk);
+	u32 notsent_bytes;
+
+	notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
+	return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
+}
+
+static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
+{
+	return mptcp_stream_memory_free(sk, wake) &&
+	       __sk_stream_is_writeable(sk, wake);
+}
+
 static inline void mptcp_write_space(struct sock *sk)
 {
-	if (sk_stream_is_writeable(sk)) {
-		/* pairs with memory barrier in mptcp_poll */
-		smp_mb();
-		if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
-			sk_stream_write_space(sk);
-	}
+	/* pairs with memory barrier in mptcp_poll */
+	smp_mb();
+	if (mptcp_stream_memory_free(sk, 1))
+		sk_stream_write_space(sk);
 }
 
 static inline void __mptcp_sync_sndbuf(struct sock *sk)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index ef3edba754..47aa826ba5 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -181,8 +181,6 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
 
 	switch (optname) {
 	case SO_KEEPALIVE:
-		mptcp_sol_socket_sync_intval(msk, optname, val);
-		return 0;
 	case SO_DEBUG:
 	case SO_MARK:
 	case SO_PRIORITY:
@@ -624,20 +622,36 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
 	return ret;
 }
 
-static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
-					 unsigned int optlen)
+static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
+				      int (*set_val)(struct sock *, int),
+				      int *msk_val, int val)
 {
 	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	int val;
+	int err = 0;
 
-	if (optlen < sizeof(int))
-		return -EINVAL;
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		int ret;
 
-	if (copy_from_sockptr(&val, optval, sizeof(val)))
-		return -EFAULT;
+		lock_sock(ssk);
+		ret = set_val(ssk, val);
+		err = err ? : ret;
+		release_sock(ssk);
+	}
+
+	if (!err) {
+		*msk_val = val;
+		sockopt_seq_inc(msk);
+	}
+
+	return err;
+}
+
+static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
+{
+	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
 
-	lock_sock(sk);
 	sockopt_seq_inc(msk);
 	msk->cork = !!val;
 	mptcp_for_each_subflow(msk, subflow) {
@@ -649,25 +663,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
 	}
 	if (!val)
 		mptcp_check_and_set_pending(sk);
-	release_sock(sk);
 
 	return 0;
 }
 
-static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
-					    unsigned int optlen)
+static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
 {
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
-	int val;
-
-	if (optlen < sizeof(int))
-		return -EINVAL;
-
-	if (copy_from_sockptr(&val, optval, sizeof(val)))
-		return -EFAULT;
 
-	lock_sock(sk);
 	sockopt_seq_inc(msk);
 	msk->nodelay = !!val;
 	mptcp_for_each_subflow(msk, subflow) {
@@ -679,8 +683,6 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
 	}
 	if (val)
 		mptcp_check_and_set_pending(sk);
-	release_sock(sk);
-
 	return 0;
 }
 
@@ -803,25 +805,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 	int ret, val;
 
 	switch (optname) {
-	case TCP_INQ:
-		ret = mptcp_get_int_option(msk, optval, optlen, &val);
-		if (ret)
-			return ret;
-		if (val < 0 || val > 1)
-			return -EINVAL;
-
-		lock_sock(sk);
-		msk->recvmsg_inq = !!val;
-		release_sock(sk);
-		return 0;
 	case TCP_ULP:
 		return -EOPNOTSUPP;
 	case TCP_CONGESTION:
 		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
-	case TCP_CORK:
-		return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
-	case TCP_NODELAY:
-		return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
 	case TCP_DEFER_ACCEPT:
 		/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
 		mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
@@ -834,7 +821,50 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 						      optval, optlen);
 	}
 
-	return -EOPNOTSUPP;
+	ret = mptcp_get_int_option(msk, optval, optlen, &val);
+	if (ret)
+		return ret;
+
+	lock_sock(sk);
+	switch (optname) {
+	case TCP_INQ:
+		if (val < 0 || val > 1)
+			ret = -EINVAL;
+		else
+			msk->recvmsg_inq = !!val;
+		break;
+	case TCP_NOTSENT_LOWAT:
+		WRITE_ONCE(msk->notsent_lowat, val);
+		mptcp_write_space(sk);
+		break;
+	case TCP_CORK:
+		ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
+		break;
+	case TCP_NODELAY:
+		ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
+		break;
+	case TCP_KEEPIDLE:
+		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
+						 &tcp_sock_set_keepidle_locked,
+						 &msk->keepalive_idle, val);
+		break;
+	case TCP_KEEPINTVL:
+		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
+						 &tcp_sock_set_keepintvl,
+						 &msk->keepalive_intvl, val);
+		break;
+	case TCP_KEEPCNT:
+		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
+						 &tcp_sock_set_keepcnt,
+						 &msk->keepalive_cnt,
+						 val);
+		break;
+	default:
+		ret = -ENOPROTOOPT;
+	}
+
+	release_sock(sk);
+	return ret;
 }
 
 int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -1331,6 +1361,8 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
 static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 				    char __user *optval, int __user *optlen)
 {
+	struct sock *sk = (void *)msk;
+
 	switch (optname) {
 	case TCP_ULP:
 	case TCP_CONGESTION:
@@ -1349,6 +1381,20 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 		return mptcp_put_int_option(msk, optval, optlen, msk->cork);
 	case TCP_NODELAY:
 		return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+	case TCP_KEEPIDLE:
+		return mptcp_put_int_option(msk, optval, optlen,
+					    msk->keepalive_idle ? :
+					    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ);
+	case TCP_KEEPINTVL:
+		return mptcp_put_int_option(msk, optval, optlen,
+					    msk->keepalive_intvl ? :
+					    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ);
+	case TCP_KEEPCNT:
+		return mptcp_put_int_option(msk, optval, optlen,
+					    msk->keepalive_cnt ? :
+					    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes));
+	case TCP_NOTSENT_LOWAT:
+		return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
 	}
 	return -EOPNOTSUPP;
 }
@@ -1464,6 +1510,9 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
 		tcp_set_congestion_control(ssk, msk->ca_name, false, true);
 	__tcp_sock_set_cork(ssk, !!msk->cork);
 	__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
+	tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
+	tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
+	tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
 
 	inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
 	inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
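The sockopt.c hunks route the simple integer SOL_TCP options through a single copy-in plus one locked switch, and add TCP_KEEPIDLE/TCP_KEEPINTVL/TCP_KEEPCNT (stored in the msk, applied to every existing subflow and, via sync_socket_options(), to new ones) as well as TCP_NOTSENT_LOWAT to getsockopt. The sketch below is not part of the patch; it only illustrates how a userspace application could exercise the new options, and the timer values are arbitrary examples.

```c
/* Minimal userspace sketch (not part of the patch): set and read back the
 * keepalive knobs now handled by mptcp_setsockopt_sol_tcp() and
 * mptcp_getsockopt_sol_tcp().  The timer values are arbitrary examples.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

static int set_tcp_int(int fd, int opt, int val)
{
	return setsockopt(fd, IPPROTO_TCP, opt, &val, sizeof(val));
}

int main(void)
{
	int keepalive = 1, idle = 60, intvl = 10, cnt = 5, got;
	socklen_t len = sizeof(got);
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);
	if (fd < 0) {
		perror("socket(IPPROTO_MPTCP)");
		return 1;
	}

	/* The TCP_KEEP* values are stored in the msk and pushed to every
	 * existing subflow; sync_socket_options() applies them to subflows
	 * created later.
	 */
	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive));
	if (set_tcp_int(fd, TCP_KEEPIDLE, idle) ||
	    set_tcp_int(fd, TCP_KEEPINTVL, intvl) ||
	    set_tcp_int(fd, TCP_KEEPCNT, cnt))
		perror("setsockopt(TCP_KEEP*)");

	/* A value that was never set falls back to the per-netns sysctl,
	 * mirroring the "?:" fallbacks in mptcp_getsockopt_sol_tcp().
	 */
	if (getsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &got, &len) == 0)
		printf("TCP_KEEPIDLE = %d s\n", got);

	close(fd);
	return 0;
}
```

Reading TCP_KEEPIDLE back returns the stored per-socket value, or the net.ipv4.tcp_keepalive_time default when it was never set, matching the getsockopt hunk above.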