summaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-03 05:08:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-03 05:08:49 +0000
commit76047c0312414915035f4ebbaa533fe3817a21e0 (patch)
tree86f79ba3c1f753b1d39be793455035794fade031 /net/mptcp
parentAdding upstream version 6.8.11. (diff)
downloadlinux-76047c0312414915035f4ebbaa533fe3817a21e0.tar.xz
linux-76047c0312414915035f4ebbaa533fe3817a21e0.zip
Adding upstream version 6.8.12.upstream/6.8.12
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/protocol.c54
-rw-r--r--net/mptcp/protocol.h45
-rw-r--r--net/mptcp/sockopt.c129
3 files changed, 161 insertions, 67 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 74c1faec27..54e29ab911 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1687,15 +1687,6 @@ out:
}
}
-static void mptcp_set_nospace(struct sock *sk)
-{
- /* enable autotune */
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
- /* will be cleared on avail space */
- set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
-}
-
static int mptcp_disconnect(struct sock *sk, int flags);
static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
@@ -1766,6 +1757,30 @@ static int do_copy_data_nocache(struct sock *sk, int copy,
return 0;
}
+/* open-code sk_stream_memory_free() plus sent limit computation to
+ * avoid indirect calls in fast-path.
+ * Called under the msk socket lock, so we can avoid a bunch of ONCE
+ * annotations.
+ */
+static u32 mptcp_send_limit(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 limit, not_sent;
+
+ if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
+ return 0;
+
+ limit = mptcp_notsent_lowat(sk);
+ if (limit == UINT_MAX)
+ return UINT_MAX;
+
+ not_sent = msk->write_seq - msk->snd_nxt;
+ if (not_sent >= limit)
+ return 0;
+
+ return limit - not_sent;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1810,6 +1825,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct mptcp_data_frag *dfrag;
bool dfrag_collapsed;
size_t psize, offset;
+ u32 copy_limit;
+
+ /* ensure fitting the notsent_lowat() constraint */
+ copy_limit = mptcp_send_limit(sk);
+ if (!copy_limit)
+ goto wait_for_memory;
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
@@ -1817,9 +1838,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
dfrag = mptcp_pending_tail(sk);
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
if (!dfrag_collapsed) {
- if (!sk_stream_memory_free(sk))
- goto wait_for_memory;
-
if (!mptcp_page_frag_refill(sk, pfrag))
goto wait_for_memory;
@@ -1834,6 +1852,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
offset = dfrag->offset + dfrag->data_len;
psize = pfrag->size - offset;
psize = min_t(size_t, psize, msg_data_left(msg));
+ psize = min_t(size_t, psize, copy_limit);
total_ts = psize + frag_truesize;
if (!sk_wmem_schedule(sk, total_ts))
@@ -1869,7 +1888,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
continue;
wait_for_memory:
- mptcp_set_nospace(sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
@@ -3770,6 +3789,7 @@ static struct proto mptcp_prot = {
.unhash = mptcp_unhash,
.get_port = mptcp_get_port,
.forward_alloc_get = mptcp_forward_alloc_get,
+ .stream_memory_free = mptcp_stream_memory_free,
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
@@ -3941,12 +3961,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
- if (sk_stream_is_writeable(sk))
+ if (__mptcp_stream_is_writeable(sk, 1))
return EPOLLOUT | EPOLLWRNORM;
- mptcp_set_nospace(sk);
- smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
- if (sk_stream_is_writeable(sk))
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
+ if (__mptcp_stream_is_writeable(sk, 1))
return EPOLLOUT | EPOLLWRNORM;
return 0;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 07f6242afc..5f4c10c41c 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -113,10 +113,9 @@
#define MPTCP_RST_TRANSIENT BIT(0)
/* MPTCP socket atomic flags */
-#define MPTCP_NOSPACE 1
-#define MPTCP_WORK_RTX 2
-#define MPTCP_FALLBACK_DONE 4
-#define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_WORK_RTX 1
+#define MPTCP_FALLBACK_DONE 2
+#define MPTCP_WORK_CLOSE_SUBFLOW 3
/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING 1
@@ -306,6 +305,10 @@ struct mptcp_sock {
in_accept_queue:1,
free_first:1,
rcvspace_init:1;
+ u32 notsent_lowat;
+ int keepalive_cnt;
+ int keepalive_idle;
+ int keepalive_intvl;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
@@ -790,14 +793,36 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}
+static inline u32 mptcp_notsent_lowat(const struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ u32 val;
+
+ val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
+ return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
+}
+
+static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ u32 notsent_bytes;
+
+ notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
+ return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
+}
+
+static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
+{
+ return mptcp_stream_memory_free(sk, wake) &&
+ __sk_stream_is_writeable(sk, wake);
+}
+
static inline void mptcp_write_space(struct sock *sk)
{
- if (sk_stream_is_writeable(sk)) {
- /* pairs with memory barrier in mptcp_poll */
- smp_mb();
- if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
- sk_stream_write_space(sk);
- }
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (mptcp_stream_memory_free(sk, 1))
+ sk_stream_write_space(sk);
}
static inline void __mptcp_sync_sndbuf(struct sock *sk)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index ef3edba754..47aa826ba5 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -181,8 +181,6 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
switch (optname) {
case SO_KEEPALIVE:
- mptcp_sol_socket_sync_intval(msk, optname, val);
- return 0;
case SO_DEBUG:
case SO_MARK:
case SO_PRIORITY:
@@ -624,20 +622,36 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
return ret;
}
-static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
+ int (*set_val)(struct sock *, int),
+ int *msk_val, int val)
{
struct mptcp_subflow_context *subflow;
- struct sock *sk = (struct sock *)msk;
- int val;
+ int err = 0;
- if (optlen < sizeof(int))
- return -EINVAL;
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int ret;
- if (copy_from_sockptr(&val, optval, sizeof(val)))
- return -EFAULT;
+ lock_sock(ssk);
+ ret = set_val(ssk, val);
+ err = err ? : ret;
+ release_sock(ssk);
+ }
+
+ if (!err) {
+ *msk_val = val;
+ sockopt_seq_inc(msk);
+ }
+
+ return err;
+}
+
+static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
- lock_sock(sk);
sockopt_seq_inc(msk);
msk->cork = !!val;
mptcp_for_each_subflow(msk, subflow) {
@@ -649,25 +663,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optva
}
if (!val)
mptcp_check_and_set_pending(sk);
- release_sock(sk);
return 0;
}
-static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
- unsigned int optlen)
+static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- int val;
-
- if (optlen < sizeof(int))
- return -EINVAL;
-
- if (copy_from_sockptr(&val, optval, sizeof(val)))
- return -EFAULT;
- lock_sock(sk);
sockopt_seq_inc(msk);
msk->nodelay = !!val;
mptcp_for_each_subflow(msk, subflow) {
@@ -679,8 +683,6 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op
}
if (val)
mptcp_check_and_set_pending(sk);
- release_sock(sk);
-
return 0;
}
@@ -803,25 +805,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
int ret, val;
switch (optname) {
- case TCP_INQ:
- ret = mptcp_get_int_option(msk, optval, optlen, &val);
- if (ret)
- return ret;
- if (val < 0 || val > 1)
- return -EINVAL;
-
- lock_sock(sk);
- msk->recvmsg_inq = !!val;
- release_sock(sk);
- return 0;
case TCP_ULP:
return -EOPNOTSUPP;
case TCP_CONGESTION:
return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
- case TCP_CORK:
- return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
- case TCP_NODELAY:
- return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
case TCP_DEFER_ACCEPT:
/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
@@ -834,7 +821,50 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
optval, optlen);
}
- return -EOPNOTSUPP;
+ ret = mptcp_get_int_option(msk, optval, optlen, &val);
+ if (ret)
+ return ret;
+
+ lock_sock(sk);
+ switch (optname) {
+ case TCP_INQ:
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ msk->recvmsg_inq = !!val;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ WRITE_ONCE(msk->notsent_lowat, val);
+ mptcp_write_space(sk);
+ break;
+ case TCP_CORK:
+ ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
+ break;
+ case TCP_NODELAY:
+ ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
+ break;
+ case TCP_KEEPIDLE:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
+ &tcp_sock_set_keepidle_locked,
+ &msk->keepalive_idle, val);
+ break;
+ case TCP_KEEPINTVL:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
+ &tcp_sock_set_keepintvl,
+ &msk->keepalive_intvl, val);
+ break;
+ case TCP_KEEPCNT:
+ ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
+ &tcp_sock_set_keepcnt,
+ &msk->keepalive_cnt,
+ val);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ }
+
+ release_sock(sk);
+ return ret;
}
int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -1331,6 +1361,8 @@ static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
char __user *optval, int __user *optlen)
{
+ struct sock *sk = (void *)msk;
+
switch (optname) {
case TCP_ULP:
case TCP_CONGESTION:
@@ -1349,6 +1381,20 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
return mptcp_put_int_option(msk, optval, optlen, msk->cork);
case TCP_NODELAY:
return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+ case TCP_KEEPIDLE:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_idle ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ);
+ case TCP_KEEPINTVL:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_intvl ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ);
+ case TCP_KEEPCNT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ msk->keepalive_cnt ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes));
+ case TCP_NOTSENT_LOWAT:
+ return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
}
return -EOPNOTSUPP;
}
@@ -1464,6 +1510,9 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
tcp_set_congestion_control(ssk, msk->ca_name, false, true);
__tcp_sock_set_cork(ssk, !!msk->cork);
__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
+ tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
+ tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
+ tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));