summaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/Kconfig39
-rw-r--r--net/mptcp/Makefile14
-rw-r--r--net/mptcp/bpf.c36
-rw-r--r--net/mptcp/crypto.c83
-rw-r--r--net/mptcp/crypto_test.c72
-rw-r--r--net/mptcp/ctrl.c245
-rw-r--r--net/mptcp/diag.c104
-rw-r--r--net/mptcp/fastopen.c81
-rw-r--r--net/mptcp/mib.c111
-rw-r--r--net/mptcp/mib.h98
-rw-r--r--net/mptcp/mptcp_diag.c248
-rw-r--r--net/mptcp/options.c1653
-rw-r--r--net/mptcp/pm.c547
-rw-r--r--net/mptcp/pm_netlink.c2406
-rw-r--r--net/mptcp/pm_userspace.c503
-rw-r--r--net/mptcp/protocol.c4102
-rw-r--r--net/mptcp/protocol.h1132
-rw-r--r--net/mptcp/sched.c173
-rw-r--r--net/mptcp/sockopt.c1486
-rw-r--r--net/mptcp/subflow.c2077
-rw-r--r--net/mptcp/syncookies.c133
-rw-r--r--net/mptcp/token.c422
-rw-r--r--net/mptcp/token_test.c145
23 files changed, 15910 insertions, 0 deletions
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
new file mode 100644
index 000000000..20328920f
--- /dev/null
+++ b/net/mptcp/Kconfig
@@ -0,0 +1,39 @@
+
+config MPTCP
+ bool "MPTCP: Multipath TCP"
+ depends on INET
+ select SKB_EXTENSIONS
+ select CRYPTO_LIB_SHA256
+ select CRYPTO
+ help
+ Multipath TCP (MPTCP) connections send and receive data over multiple
+ subflows in order to utilize multiple network paths. Each subflow
+ uses the TCP protocol, and TCP options carry header information for
+ MPTCP.
+
+if MPTCP
+
+config INET_MPTCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
+config MPTCP_IPV6
+ bool "MPTCP: IPv6 support for Multipath TCP"
+ depends on IPV6=y
+ default y
+
+config MPTCP_KUNIT_TEST
+ tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Currently covers the MPTCP crypto and token helpers.
+ Only useful for kernel devs running KUnit test harness and are not
+ for inclusion into a production build.
+
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
+
+endif
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
new file mode 100644
index 000000000..84e531f86
--- /dev/null
+++ b/net/mptcp/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_MPTCP) += mptcp.o
+
+mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
+ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o
+
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+
+mptcp_crypto_test-objs := crypto_test.o
+mptcp_token_test-objs := token_test.o
+obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o
+
+obj-$(CONFIG_BPF_SYSCALL) += bpf.o
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
new file mode 100644
index 000000000..8a16672b9
--- /dev/null
+++ b/net/mptcp/bpf.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2020, Tessares SA.
+ * Copyright (c) 2022, SUSE.
+ *
+ * Author: Nicolas Rybowski <nicolas.rybowski@tessares.net>
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/bpf.h>
+#include "protocol.h"
+
+struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk)
+{
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk))
+ return mptcp_sk(mptcp_subflow_ctx(sk)->conn);
+
+ return NULL;
+}
+
+BTF_SET8_START(bpf_mptcp_fmodret_ids)
+BTF_ID_FLAGS(func, update_socket_protocol)
+BTF_SET8_END(bpf_mptcp_fmodret_ids)
+
+static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_mptcp_fmodret_ids,
+};
+
+static int __init bpf_mptcp_kfunc_init(void)
+{
+ return register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set);
+}
+late_initcall(bpf_mptcp_kfunc_init);
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
new file mode 100644
index 000000000..a89313499
--- /dev/null
+++ b/net/mptcp/crypto.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP cryptographic functions
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ *
+ * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and
+ * mptcp_ipv6 from multipath-tcp.org, authored by:
+ *
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ * Gregory Detal <gregory.detal@uclouvain.be>
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
+ * Andreas Ripke <ripke@neclab.eu>
+ * Vlad Dogaru <vlad.dogaru@intel.com>
+ * Octavian Purdila <octavian.purdila@intel.com>
+ * John Ronan <jronan@tssg.org>
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
+ * Brandon Heller <brandonh@stanford.edu>
+ */
+
+#include <linux/kernel.h>
+#include <crypto/sha2.h>
+#include <asm/unaligned.h>
+
+#include "protocol.h"
+
+#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)
+
+void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
+{
+ __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
+ __be64 input = cpu_to_be64(key);
+
+ sha256((__force u8 *)&input, sizeof(input), (u8 *)mptcp_hashed_key);
+
+ if (token)
+ *token = be32_to_cpu(mptcp_hashed_key[0]);
+ if (idsn)
+ *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
+}
+
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
+{
+ u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
+ u8 key1be[8];
+ u8 key2be[8];
+ int i;
+
+ if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
+ len = SHA256_DIGEST_SIZE;
+
+ put_unaligned_be64(key1, key1be);
+ put_unaligned_be64(key2, key2be);
+
+ /* Generate key xored with ipad */
+ memset(input, 0x36, SHA256_BLOCK_SIZE);
+ for (i = 0; i < 8; i++)
+ input[i] ^= key1be[i];
+ for (i = 0; i < 8; i++)
+ input[i + 8] ^= key2be[i];
+
+ memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
+
+ /* emit sha256(K1 || msg) on the second input block, so we can
+ * reuse 'input' for the last hashing
+ */
+ sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]);
+
+ /* Prepare second part of hmac */
+ memset(input, 0x5C, SHA256_BLOCK_SIZE);
+ for (i = 0; i < 8; i++)
+ input[i] ^= key1be[i];
+ for (i = 0; i < 8; i++)
+ input[i + 8] ^= key2be[i];
+
+ sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
+}
+
+#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
+EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha);
+#endif
diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c
new file mode 100644
index 000000000..017248dea
--- /dev/null
+++ b/net/mptcp/crypto_test.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "protocol.h"
+
+struct test_case {
+ char *key;
+ char *msg;
+ char *result;
+};
+
+/* we can't reuse RFC 4231 test vectors, as we have constraint on the
+ * input and key size.
+ */
+static struct test_case tests[] = {
+ {
+ .key = "0b0b0b0b0b0b0b0b",
+ .msg = "48692054",
+ .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",
+ },
+ {
+ .key = "aaaaaaaaaaaaaaaa",
+ .msg = "dddddddd",
+ .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",
+ },
+ {
+ .key = "0102030405060708",
+ .msg = "cdcdcdcd",
+ .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",
+ },
+};
+
+static void mptcp_crypto_test_basic(struct kunit *test)
+{
+ char hmac[32], hmac_hex[65];
+ u32 nonce1, nonce2;
+ u64 key1, key2;
+ u8 msg[8];
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+ /* mptcp hmap will convert to be before computing the hmac */
+ key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
+ key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
+ nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
+ nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+ for (j = 0; j < 32; ++j)
+ sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
+ hmac_hex[64] = 0;
+
+ KUNIT_EXPECT_STREQ(test, &hmac_hex[0], tests[i].result);
+ }
+}
+
+static struct kunit_case mptcp_crypto_test_cases[] = {
+ KUNIT_CASE(mptcp_crypto_test_basic),
+ {}
+};
+
+static struct kunit_suite mptcp_crypto_suite = {
+ .name = "mptcp-crypto",
+ .test_cases = mptcp_crypto_test_cases,
+};
+
+kunit_test_suite(mptcp_crypto_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
new file mode 100644
index 000000000..e72b518c5
--- /dev/null
+++ b/net/mptcp/ctrl.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Tessares SA.
+ */
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "protocol.h"
+
+#define MPTCP_SYSCTL_PATH "net/mptcp"
+
+static int mptcp_pernet_id;
+
+#ifdef CONFIG_SYSCTL
+static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
+#endif
+
+struct mptcp_pernet {
+#ifdef CONFIG_SYSCTL
+ struct ctl_table_header *ctl_table_hdr;
+#endif
+
+ unsigned int add_addr_timeout;
+ unsigned int stale_loss_cnt;
+ u8 mptcp_enabled;
+ u8 checksum_enabled;
+ u8 allow_join_initial_addr_port;
+ u8 pm_type;
+ char scheduler[MPTCP_SCHED_NAME_MAX];
+};
+
+static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
+{
+ return net_generic(net, mptcp_pernet_id);
+}
+
+int mptcp_is_enabled(const struct net *net)
+{
+ return mptcp_get_pernet(net)->mptcp_enabled;
+}
+
+unsigned int mptcp_get_add_addr_timeout(const struct net *net)
+{
+ return mptcp_get_pernet(net)->add_addr_timeout;
+}
+
+int mptcp_is_checksum_enabled(const struct net *net)
+{
+ return mptcp_get_pernet(net)->checksum_enabled;
+}
+
+int mptcp_allow_join_id0(const struct net *net)
+{
+ return mptcp_get_pernet(net)->allow_join_initial_addr_port;
+}
+
+unsigned int mptcp_stale_loss_cnt(const struct net *net)
+{
+ return mptcp_get_pernet(net)->stale_loss_cnt;
+}
+
+int mptcp_get_pm_type(const struct net *net)
+{
+ return mptcp_get_pernet(net)->pm_type;
+}
+
+const char *mptcp_get_scheduler(const struct net *net)
+{
+ return mptcp_get_pernet(net)->scheduler;
+}
+
+static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
+{
+ pernet->mptcp_enabled = 1;
+ pernet->add_addr_timeout = TCP_RTO_MAX;
+ pernet->checksum_enabled = 0;
+ pernet->allow_join_initial_addr_port = 1;
+ pernet->stale_loss_cnt = 4;
+ pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
+ strcpy(pernet->scheduler, "default");
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table mptcp_sysctl_table[] = {
+ {
+ .procname = "enabled",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ /* users with CAP_NET_ADMIN or root (not and) can change this
+ * value, same as other sysctl or the 'net' tree.
+ */
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "add_addr_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "checksum_enabled",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "allow_join_initial_addr_port",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "stale_loss_cnt",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ },
+ {
+ .procname = "pm_type",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &mptcp_pm_type_max
+ },
+ {
+ .procname = "scheduler",
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {}
+};
+
+static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
+{
+ struct ctl_table_header *hdr;
+ struct ctl_table *table;
+
+ table = mptcp_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ }
+
+ table[0].data = &pernet->mptcp_enabled;
+ table[1].data = &pernet->add_addr_timeout;
+ table[2].data = &pernet->checksum_enabled;
+ table[3].data = &pernet->allow_join_initial_addr_port;
+ table[4].data = &pernet->stale_loss_cnt;
+ table[5].data = &pernet->pm_type;
+ table[6].data = &pernet->scheduler;
+
+ hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
+ ARRAY_SIZE(mptcp_sysctl_table));
+ if (!hdr)
+ goto err_reg;
+
+ pernet->ctl_table_hdr = hdr;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
+{
+ struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(pernet->ctl_table_hdr);
+
+ kfree(table);
+}
+
+#else
+
+static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
+{
+ return 0;
+}
+
+static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}
+
+#endif /* CONFIG_SYSCTL */
+
+static int __net_init mptcp_net_init(struct net *net)
+{
+ struct mptcp_pernet *pernet = mptcp_get_pernet(net);
+
+ mptcp_pernet_set_defaults(pernet);
+
+ return mptcp_pernet_new_table(net, pernet);
+}
+
+/* Note: the callback will only be called per extra netns */
+static void __net_exit mptcp_net_exit(struct net *net)
+{
+ struct mptcp_pernet *pernet = mptcp_get_pernet(net);
+
+ mptcp_pernet_del_table(pernet);
+}
+
+static struct pernet_operations mptcp_pernet_ops = {
+ .init = mptcp_net_init,
+ .exit = mptcp_net_exit,
+ .id = &mptcp_pernet_id,
+ .size = sizeof(struct mptcp_pernet),
+};
+
+void __init mptcp_init(void)
+{
+ mptcp_join_cookie_init();
+ mptcp_proto_init();
+
+ if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
+ panic("Failed to register MPTCP pernet subsystem.\n");
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+int __init mptcpv6_init(void)
+{
+ int err;
+
+ err = mptcp_proto_v6_init();
+
+ return err;
+}
+#endif
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
new file mode 100644
index 000000000..a53658674
--- /dev/null
+++ b/net/mptcp/diag.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2019 Red Hat
+ *
+ * Author: Davide Caratti <dcaratti@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *sf;
+ struct nlattr *start;
+ u32 flags = 0;
+ int err;
+
+ start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
+ if (!start)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
+ if (!sf) {
+ err = 0;
+ goto nla_failure;
+ }
+
+ if (sf->mp_capable)
+ flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM;
+ if (sf->request_mptcp)
+ flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC;
+ if (sf->mp_join)
+ flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM;
+ if (sf->request_join)
+ flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC;
+ if (sf->backup)
+ flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM;
+ if (sf->request_bkup)
+ flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC;
+ if (sf->fully_established)
+ flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED;
+ if (sf->conn_finished)
+ flags |= MPTCP_SUBFLOW_FLAG_CONNECTED;
+ if (sf->map_valid)
+ flags |= MPTCP_SUBFLOW_FLAG_MAPVALID;
+
+ if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+ sf->rel_write_seq) ||
+ nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq,
+ MPTCP_SUBFLOW_ATTR_PAD) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+ sf->map_subflow_seq) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) ||
+ nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+ sf->map_data_len) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) ||
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) ||
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) {
+ err = -EMSGSIZE;
+ goto nla_failure;
+ }
+
+ rcu_read_unlock();
+ nla_nest_end(skb, start);
+ return 0;
+
+nla_failure:
+ rcu_read_unlock();
+ nla_nest_cancel(skb, start);
+ return err;
+}
+
+static size_t subflow_get_info_size(const struct sock *sk)
+{
+ size_t size = 0;
+
+ size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
+ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
+ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */
+ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */
+ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */
+ 0;
+ return size;
+}
+
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops)
+{
+ ops->get_info = subflow_get_info;
+ ops->get_info_size = subflow_get_info_size;
+}
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
new file mode 100644
index 000000000..74698582a
--- /dev/null
+++ b/net/mptcp/fastopen.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP Fast Open Mechanism
+ *
+ * Copyright (c) 2021-2022, Dmytro SHYTYI
+ */
+
+#include "protocol.h"
+
+void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
+ struct request_sock *req)
+{
+ struct sock *sk, *ssk;
+ struct sk_buff *skb;
+ struct tcp_sock *tp;
+
+ /* on early fallback the subflow context is deleted by
+ * subflow_syn_recv_sock()
+ */
+ if (!subflow)
+ return;
+
+ ssk = subflow->tcp_sock;
+ sk = subflow->conn;
+ tp = tcp_sk(ssk);
+
+ subflow->is_mptfo = 1;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ return;
+
+ /* dequeue the skb from sk receive queue */
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* We copy the fastopen data, but that don't belong to the mptcp sequence
+ * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset()
+ */
+ tp->copied_seq += skb->len;
+ subflow->ssn_offset += skb->len;
+
+ /* initialize a dummy sequence number, we will update it at MPC
+ * completion, if needed
+ */
+ MPTCP_SKB_CB(skb)->map_seq = -skb->len;
+ MPTCP_SKB_CB(skb)->end_seq = 0;
+ MPTCP_SKB_CB(skb)->offset = 0;
+ MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+
+ mptcp_data_lock(sk);
+
+ mptcp_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ mptcp_sk(sk)->bytes_received += skb->len;
+
+ sk->sk_data_ready(sk);
+
+ mptcp_data_unlock(sk);
+}
+
+void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb;
+
+ mptcp_data_lock(sk);
+ skb = skb_peek_tail(&sk->sk_receive_queue);
+ if (skb) {
+ WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
+ pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk,
+ MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
+ MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
+ MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
+ MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
+ }
+
+ pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq);
+ mptcp_data_unlock(sk);
+}
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
new file mode 100644
index 000000000..a0990c365
--- /dev/null
+++ b/net/mptcp/mib.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/mptcp.h>
+#include <net/snmp.h>
+#include <net/net_namespace.h>
+
+#include "mib.h"
+
+static const struct snmp_mib mptcp_snmp_list[] = {
+ SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
+ SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE),
+ SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK),
+ SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
+ SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
+ SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
+ SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
+ SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+ SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
+ SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
+ SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX),
+ SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
+ SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR),
+ SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL),
+ SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE),
+ SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE),
+ SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
+ SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
+ SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
+ SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX),
+ SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP),
+ SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
+ SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX),
+ SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP),
+ SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD),
+ SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP),
+ SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX),
+ SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX),
+ SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX),
+ SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX),
+ SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
+ SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP),
+ SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX),
+ SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP),
+ SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
+ SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
+ SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
+ SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX),
+ SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX),
+ SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX),
+ SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX),
+ SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX),
+ SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX),
+ SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
+ SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
+ SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
+ SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
+ SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED),
+ SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
+ SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
+ SNMP_MIB_SENTINEL
+};
+
+/* mptcp_mib_alloc - allocate percpu mib counters
+ *
+ * These are allocated when the first mptcp socket is created so
+ * we do not waste percpu memory if mptcp isn't in use.
+ */
+bool mptcp_mib_alloc(struct net *net)
+{
+ struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib);
+
+ if (!mib)
+ return false;
+
+ if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib))
+ free_percpu(mib);
+
+ return true;
+}
+
+void mptcp_seq_show(struct seq_file *seq)
+{
+ unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1];
+ struct net *net = seq->private;
+ int i;
+
+ seq_puts(seq, "MPTcpExt:");
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_printf(seq, " %s", mptcp_snmp_list[i].name);
+
+ seq_puts(seq, "\nMPTcpExt:");
+
+ memset(sum, 0, sizeof(sum));
+ if (net->mib.mptcp_statistics)
+ snmp_get_cpu_field_batch(sum, mptcp_snmp_list,
+ net->mib.mptcp_statistics);
+
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_printf(seq, " %lu", sum[i]);
+
+ seq_putc(seq, '\n');
+}
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
new file mode 100644
index 000000000..cae71d947
--- /dev/null
+++ b/net/mptcp/mib.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+enum linux_mptcp_mib_field {
+ MPTCP_MIB_NUM = 0,
+ MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */
+ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
+ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
+ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */
+ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */
+ MPTCP_MIB_DATACSUMERR, /* The data checksum fail */
+ MPTCP_MIB_OFOQUEUETAIL, /* Segments inserted into OoO queue tail */
+ MPTCP_MIB_OFOQUEUE, /* Segments inserted into OoO queue */
+ MPTCP_MIB_OFOMERGE, /* Segments merged in OoO queue */
+ MPTCP_MIB_NODSSWINDOW, /* Segments not in MPTCP windows */
+ MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
+ MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
+ MPTCP_MIB_ADDADDRTX, /* Sent ADD_ADDR with echo-flag=0 */
+ MPTCP_MIB_ADDADDRTXDROP, /* ADD_ADDR with echo-flag=0 not send due to
+ * resource exhaustion
+ */
+ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_ECHOADDTX, /* Send ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_ECHOADDTXDROP, /* ADD_ADDR with echo-flag=1 not send due
+ * to resource exhaustion
+ */
+ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */
+ MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */
+ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */
+ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */
+ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */
+ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */
+ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */
+ MPTCP_MIB_RMADDR, /* Received RM_ADDR */
+ MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */
+ MPTCP_MIB_RMADDRTX, /* Sent RM_ADDR */
+ MPTCP_MIB_RMADDRTXDROP, /* RM_ADDR not sent due to resource exhaustion */
+ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
+ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
+ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
+ MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */
+ MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */
+ MPTCP_MIB_MPFASTCLOSETX, /* Transmit a MP_FASTCLOSE */
+ MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */
+ MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */
+ MPTCP_MIB_MPRSTRX, /* Received a MP_RST */
+ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
+ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */
+ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */
+ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */
+ MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */
+ MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to
+ * conflict with another subflow while updating msk rcv wnd
+ */
+ MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */
+ __MPTCP_MIB_MAX
+};
+
+#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX
+struct mptcp_mib {
+ unsigned long mibs[LINUX_MIB_MPTCP_MAX];
+};
+
+static inline void MPTCP_ADD_STATS(struct net *net,
+ enum linux_mptcp_mib_field field,
+ int val)
+{
+ if (likely(net->mib.mptcp_statistics))
+ SNMP_ADD_STATS(net->mib.mptcp_statistics, field, val);
+}
+
+static inline void MPTCP_INC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+static inline void __MPTCP_INC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ __SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+bool mptcp_mib_alloc(struct net *net);
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
new file mode 100644
index 000000000..8df1bdb64
--- /dev/null
+++ b/net/mptcp/mptcp_diag.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2020 Red Hat
+ *
+ * Author: Paolo Abeni <pabeni@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc, bool net_admin)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
+ net_admin);
+}
+
+static int mptcp_diag_dump_one(struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sk_buff *in_skb = cb->skb;
+ struct mptcp_sock *msk = NULL;
+ struct sk_buff *rep;
+ int err = -ENOENT;
+ struct net *net;
+ struct sock *sk;
+
+ net = sock_net(in_skb->sk);
+ msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]);
+ if (!msk)
+ goto out_nosk;
+
+ err = -ENOMEM;
+ sk = (struct sock *)msk;
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct mptcp_info)) +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ sock_put(sk);
+
+out_nosk:
+ return err;
+}
+
+struct mptcp_diag_ctx {
+ long s_slot;
+ long s_num;
+ unsigned int l_slot;
+ unsigned int l_num;
+};
+
+static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ bool net_admin)
+{
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
+ struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ struct net *net = sock_net(skb->sk);
+ struct inet_hashinfo *hinfo;
+ int i;
+
+ hinfo = net->ipv4.tcp_death_row.hashinfo;
+
+ for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int num = 0;
+
+ ilb = &hinfo->lhash2[i];
+
+ rcu_read_lock();
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ const struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ int ret;
+
+ if (num < diag_ctx->l_num)
+ goto next_listen;
+
+ if (!ctx || strcmp(inet_csk(sk)->icsk_ulp_ops->name, "mptcp"))
+ goto next_listen;
+
+ sk = ctx->conn;
+ if (!sk || !net_eq(sock_net(sk), net))
+ goto next_listen;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_listen;
+
+ ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+
+ sock_put(sk);
+
+ if (ret < 0) {
+ spin_unlock(&ilb->lock);
+ rcu_read_unlock();
+ diag_ctx->l_slot = i;
+ diag_ctx->l_num = num;
+ return;
+ }
+ diag_ctx->l_num = num + 1;
+ num = 0;
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+ rcu_read_unlock();
+
+ cond_resched();
+ diag_ctx->l_num = 0;
+ }
+
+ diag_ctx->l_num = 0;
+ diag_ctx->l_slot = i;
+}
+
+static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
+{
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
+ struct mptcp_sock *msk;
+ struct nlattr *bc;
+
+ BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
+
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
+
+ while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
+ &diag_ctx->s_num)) != NULL) {
+ struct inet_sock *inet = (struct inet_sock *)msk;
+ struct sock *sk = (struct sock *)msk;
+ int ret = 0;
+
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+next:
+ sock_put(sk);
+ if (ret < 0) {
+ /* will retry on the same position */
+ diag_ctx->s_num--;
+ break;
+ }
+ cond_resched();
+ }
+
+ if ((r->idiag_states & TCPF_LISTEN) && r->id.idiag_dport == 0)
+ mptcp_diag_dump_listeners(skb, cb, r, net_admin);
+}
+
+static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_info *info = _info;
+
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+
+ if (inet_sk_state_load(sk) == TCP_LISTEN) {
+ struct sock *lsk = READ_ONCE(msk->first);
+
+ if (lsk) {
+ /* override with settings from tcp listener,
+ * so Send-Q will show accept queue.
+ */
+ r->idiag_rqueue = READ_ONCE(lsk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(lsk->sk_max_ack_backlog);
+ }
+ }
+
+ if (!info)
+ return;
+
+ mptcp_diag_fill_info(msk, info);
+}
+
+static const struct inet_diag_handler mptcp_diag_handler = {
+ .dump = mptcp_diag_dump,
+ .dump_one = mptcp_diag_dump_one,
+ .idiag_get_info = mptcp_diag_get_info,
+ .idiag_type = IPPROTO_MPTCP,
+ .idiag_info_size = sizeof(struct mptcp_info),
+};
+
+static int __init mptcp_diag_init(void)
+{
+ return inet_diag_register(&mptcp_diag_handler);
+}
+
+static void __exit mptcp_diag_exit(void)
+{
+ inet_diag_unregister(&mptcp_diag_handler);
+}
+
+module_init(mptcp_diag_init);
+module_exit(mptcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
new file mode 100644
index 000000000..d2527d189
--- /dev/null
+++ b/net/mptcp/options.c
@@ -0,0 +1,1653 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <crypto/sha2.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+#include "mib.h"
+
+#include <trace/events/mptcp.h>
+
+static bool mptcp_cap_flag_sha256(u8 flags)
+{
+ return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
+}
+
+static void mptcp_parse_option(const struct sk_buff *skb,
+ const unsigned char *ptr, int opsize,
+ struct mptcp_options_received *mp_opt)
+{
+ u8 subtype = *ptr >> 4;
+ int expected_opsize;
+ u16 subopt;
+ u8 version;
+ u8 flags;
+ u8 i;
+
+ switch (subtype) {
+ case MPTCPOPT_MP_CAPABLE:
+ /* strict size checking */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ if (skb->len > tcp_hdr(skb)->doff << 2)
+ expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ else
+ expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
+ subopt = OPTION_MPTCP_MPC_ACK;
+ } else {
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) {
+ expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
+ subopt = OPTION_MPTCP_MPC_SYNACK;
+ } else {
+ expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
+ subopt = OPTION_MPTCP_MPC_SYN;
+ }
+ }
+
+ /* Cfr RFC 8684 Section 3.3.0:
+ * If a checksum is present but its use had
+ * not been negotiated in the MP_CAPABLE handshake, the receiver MUST
+ * close the subflow with a RST, as it is not behaving as negotiated.
+ * If a checksum is not present when its use has been negotiated, the
+ * receiver MUST close the subflow with a RST, as it is considered
+ * broken
+ * We parse even option with mismatching csum presence, so that
+ * later in subflow_data_ready we can trigger the reset.
+ */
+ if (opsize != expected_opsize &&
+ (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA ||
+ opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM))
+ break;
+
+ /* try to be gentle vs future versions on the initial syn */
+ version = *ptr++ & MPTCP_VERSION_MASK;
+ if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
+ if (version != MPTCP_SUPPORTED_VERSION)
+ break;
+ } else if (version < MPTCP_SUPPORTED_VERSION) {
+ break;
+ }
+
+ flags = *ptr++;
+ if (!mptcp_cap_flag_sha256(flags) ||
+ (flags & MPTCP_CAP_EXTENSIBILITY))
+ break;
+
+ /* RFC 6824, Section 3.1:
+ * "For the Checksum Required bit (labeled "A"), if either
+ * host requires the use of checksums, checksums MUST be used.
+ * In other words, the only way for checksums not to be used
+ * is if both hosts in their SYNs set A=0."
+ */
+ if (flags & MPTCP_CAP_CHECKSUM_REQD)
+ mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
+
+ mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0);
+
+ mp_opt->suboptions |= subopt;
+ if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
+ mp_opt->sndr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
+ mp_opt->rcvr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) {
+ /* Section 3.1.:
+ * "the data parameters in a MP_CAPABLE are semantically
+ * equivalent to those in a DSS option and can be used
+ * interchangeably."
+ */
+ mp_opt->suboptions |= OPTION_MPTCP_DSS;
+ mp_opt->use_map = 1;
+ mp_opt->mpc_map = 1;
+ mp_opt->use_ack = 0;
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) {
+ mp_opt->csum = get_unaligned((__force __sum16 *)ptr);
+ mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
+ ptr += 2;
+ }
+ pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u",
+ version, flags, opsize, mp_opt->sndr_key,
+ mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum);
+ break;
+
+ case MPTCPOPT_MP_JOIN:
+ if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
+ mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN;
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->token = get_unaligned_be32(ptr);
+ ptr += 4;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->token, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
+ mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK;
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->thmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->thmac, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
+ mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK;
+ ptr += 2;
+ memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
+ pr_debug("MP_JOIN hmac");
+ }
+ break;
+
+ case MPTCPOPT_DSS:
+ pr_debug("DSS");
+ ptr++;
+
+ /* we must clear 'mpc_map' be able to detect MP_CAPABLE
+ * map vs DSS map in mptcp_incoming_options(), and reconstruct
+ * map info accordingly
+ */
+ mp_opt->mpc_map = 0;
+ flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
+ mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
+ mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
+ mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
+ mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
+ mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
+
+ pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
+ mp_opt->data_fin, mp_opt->dsn64,
+ mp_opt->use_map, mp_opt->ack64,
+ mp_opt->use_ack);
+
+ expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
+
+ if (mp_opt->use_ack) {
+ if (mp_opt->ack64)
+ expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
+ else
+ expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
+ }
+
+ if (mp_opt->use_map) {
+ if (mp_opt->dsn64)
+ expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
+ else
+ expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
+ }
+
+ /* Always parse any csum presence combination, we will enforce
+ * RFC 8684 Section 3.3.0 checks later in subflow_data_ready
+ */
+ if (opsize != expected_opsize &&
+ opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
+ break;
+
+ mp_opt->suboptions |= OPTION_MPTCP_DSS;
+ if (mp_opt->use_ack) {
+ if (mp_opt->ack64) {
+ mp_opt->data_ack = get_unaligned_be64(ptr);
+ ptr += 8;
+ } else {
+ mp_opt->data_ack = get_unaligned_be32(ptr);
+ ptr += 4;
+ }
+
+ pr_debug("data_ack=%llu", mp_opt->data_ack);
+ }
+
+ if (mp_opt->use_map) {
+ if (mp_opt->dsn64) {
+ mp_opt->data_seq = get_unaligned_be64(ptr);
+ ptr += 8;
+ } else {
+ mp_opt->data_seq = get_unaligned_be32(ptr);
+ ptr += 4;
+ }
+
+ mp_opt->subflow_seq = get_unaligned_be32(ptr);
+ ptr += 4;
+
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+
+ if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) {
+ mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
+ mp_opt->csum = get_unaligned((__force __sum16 *)ptr);
+ ptr += 2;
+ }
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+ mp_opt->data_seq, mp_opt->subflow_seq,
+ mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD),
+ mp_opt->csum);
+ }
+
+ break;
+
+ case MPTCPOPT_ADD_ADDR:
+ mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
+ if (!mp_opt->echo) {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
+ mp_opt->addr.family = AF_INET;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
+ mp_opt->addr.family = AF_INET6;
+#endif
+ else
+ break;
+ } else {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
+ mp_opt->addr.family = AF_INET;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
+ mp_opt->addr.family = AF_INET6;
+#endif
+ else
+ break;
+ }
+
+ mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR;
+ mp_opt->addr.id = *ptr++;
+ mp_opt->addr.port = 0;
+ mp_opt->ahmac = 0;
+ if (mp_opt->addr.family == AF_INET) {
+ memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4);
+ ptr += 4;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
+ mp_opt->addr.port = htons(get_unaligned_be16(ptr));
+ ptr += 2;
+ }
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else {
+ memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16);
+ ptr += 16;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
+ mp_opt->addr.port = htons(get_unaligned_be16(ptr));
+ ptr += 2;
+ }
+ }
+#endif
+ if (!mp_opt->echo) {
+ mp_opt->ahmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
+ (mp_opt->addr.family == AF_INET6) ? "6" : "",
+ mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port));
+ break;
+
+ case MPTCPOPT_RM_ADDR:
+ if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
+ opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
+ break;
+
+ ptr++;
+
+ mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR;
+ mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
+ for (i = 0; i < mp_opt->rm_list.nr; i++)
+ mp_opt->rm_list.ids[i] = *ptr++;
+ pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
+ break;
+
+ case MPTCPOPT_MP_PRIO:
+ if (opsize != TCPOLEN_MPTCP_PRIO)
+ break;
+
+ mp_opt->suboptions |= OPTION_MPTCP_PRIO;
+ mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
+ pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
+ break;
+
+ case MPTCPOPT_MP_FASTCLOSE:
+ if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
+ break;
+
+ ptr += 2;
+ mp_opt->rcvr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE;
+ pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key);
+ break;
+
+ case MPTCPOPT_RST:
+ if (opsize != TCPOLEN_MPTCP_RST)
+ break;
+
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
+ break;
+
+ mp_opt->suboptions |= OPTION_MPTCP_RST;
+ flags = *ptr++;
+ mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
+ mp_opt->reset_reason = *ptr;
+ pr_debug("MP_RST: transient=%u reason=%u",
+ mp_opt->reset_transient, mp_opt->reset_reason);
+ break;
+
+ case MPTCPOPT_MP_FAIL:
+ if (opsize != TCPOLEN_MPTCP_FAIL)
+ break;
+
+ ptr += 2;
+ mp_opt->suboptions |= OPTION_MPTCP_FAIL;
+ mp_opt->fail_seq = get_unaligned_be64(ptr);
+ pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq);
+ break;
+
+ default:
+ break;
+ }
+}
+
+void mptcp_get_options(const struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const unsigned char *ptr;
+ int length;
+
+ /* initialize option status */
+ mp_opt->suboptions = 0;
+
+ length = (th->doff * 4) - sizeof(struct tcphdr);
+ ptr = (const unsigned char *)(th + 1);
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ if (opcode == TCPOPT_MPTCP)
+ mptcp_parse_option(skb, ptr, opsize, mp_opt);
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+ unsigned int *size, struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ /* we will use snd_isn to detect first pkt [re]transmission
+ * in mptcp_established_options_mp()
+ */
+ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
+ if (subflow->request_mptcp) {
+ opts->suboptions = OPTION_MPTCP_MPC_SYN;
+ opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk));
+ opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk));
+ *size = TCPOLEN_MPTCP_MPC_SYN;
+ return true;
+ } else if (subflow->request_join) {
+ pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+ subflow->local_nonce);
+ opts->suboptions = OPTION_MPTCP_MPJ_SYN;
+ opts->join_id = subflow->local_id;
+ opts->token = subflow->remote_token;
+ opts->nonce = subflow->local_nonce;
+ opts->backup = subflow->request_bkup;
+ *size = TCPOLEN_MPTCP_MPJ_SYN;
+ return true;
+ }
+ return false;
+}
+
+static void clear_3rdack_retransmission(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ icsk->icsk_ack.timeout = 0;
+ icsk->icsk_ack.ato = 0;
+ icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
+}
+
+static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
+ bool snd_data_fin_enable,
+ unsigned int *size,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_ext *mpext;
+ unsigned int data_len;
+ u8 len;
+
+ /* When skb is not available, we better over-estimate the emitted
+ * options len. A full DSS option (28 bytes) is longer than
+ * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
+ * tell the caller to defer the estimate to
+ * mptcp_established_options_dss(), which will reserve enough space.
+ */
+ if (!skb)
+ return false;
+
+ /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */
+ if (subflow->fully_established || snd_data_fin_enable ||
+ subflow->snd_isn != TCP_SKB_CB(skb)->seq ||
+ sk->sk_state != TCP_ESTABLISHED)
+ return false;
+
+ if (subflow->mp_capable) {
+ mpext = mptcp_get_ext(skb);
+ data_len = mpext ? mpext->data_len : 0;
+
+ /* we will check ops->data_len in mptcp_write_options() to
+ * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
+ * TCPOLEN_MPTCP_MPC_ACK
+ */
+ opts->data_len = data_len;
+ opts->suboptions = OPTION_MPTCP_MPC_ACK;
+ opts->sndr_key = subflow->local_key;
+ opts->rcvr_key = subflow->remote_key;
+ opts->csum_reqd = READ_ONCE(msk->csum_enabled);
+ opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk));
+
+ /* Section 3.1.
+ * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
+ * packets that start the first subflow of an MPTCP connection,
+ * as well as the first packet that carries data
+ */
+ if (data_len > 0) {
+ len = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ if (opts->csum_reqd) {
+ /* we need to propagate more info to csum the pseudo hdr */
+ opts->data_seq = mpext->data_seq;
+ opts->subflow_seq = mpext->subflow_seq;
+ opts->csum = mpext->csum;
+ len += TCPOLEN_MPTCP_DSS_CHECKSUM;
+ }
+ *size = ALIGN(len, 4);
+ } else {
+ *size = TCPOLEN_MPTCP_MPC_ACK;
+ }
+
+ pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+ subflow, subflow->local_key, subflow->remote_key,
+ data_len);
+
+ return true;
+ } else if (subflow->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_ACK;
+ memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
+ *size = TCPOLEN_MPTCP_MPJ_ACK;
+ pr_debug("subflow=%p", subflow);
+
+ /* we can use the full delegate action helper only from BH context
+ * If we are in process context - sk is flushing the backlog at
+ * socket lock release time - just set the appropriate flag, will
+ * be handled by the release callback
+ */
+ if (sock_owned_by_user(sk))
+ set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status);
+ else
+ mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK);
+ return true;
+ }
+ return false;
+}
+
+static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb, struct mptcp_ext *ext)
+{
+ /* The write_seq value has already been incremented, so the actual
+ * sequence number for the DATA_FIN is one less.
+ */
+ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1;
+
+ if (!ext->use_map || !skb->len) {
+ /* RFC6824 requires a DSS mapping with specific values
+ * if DATA_FIN is set but no data payload is mapped
+ */
+ ext->data_fin = 1;
+ ext->use_map = 1;
+ ext->dsn64 = 1;
+ ext->data_seq = data_fin_tx_seq;
+ ext->subflow_seq = 0;
+ ext->data_len = 1;
+ } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
+ /* If there's an existing DSS mapping and it is the
+ * final mapping, DATA_FIN consumes 1 additional byte of
+ * mapping space.
+ */
+ ext->data_fin = 1;
+ ext->data_len++;
+ }
+}
+
+static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
+ bool snd_data_fin_enable,
+ unsigned int *size,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ unsigned int dss_size = 0;
+ struct mptcp_ext *mpext;
+ unsigned int ack_size;
+ bool ret = false;
+ u64 ack_seq;
+
+ opts->csum_reqd = READ_ONCE(msk->csum_enabled);
+ mpext = skb ? mptcp_get_ext(skb) : NULL;
+
+ if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
+ unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
+
+ if (mpext) {
+ if (opts->csum_reqd)
+ map_size += TCPOLEN_MPTCP_DSS_CHECKSUM;
+
+ opts->ext_copy = *mpext;
+ }
+
+ dss_size = map_size;
+ if (skb && snd_data_fin_enable)
+ mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
+ opts->suboptions = OPTION_MPTCP_DSS;
+ ret = true;
+ }
+
+ /* passive sockets msk will set the 'can_ack' after accept(), even
+ * if the first subflow may have the already the remote key handy
+ */
+ opts->ext_copy.use_ack = 0;
+ if (!READ_ONCE(msk->can_ack)) {
+ *size = ALIGN(dss_size, 4);
+ return ret;
+ }
+
+ ack_seq = READ_ONCE(msk->ack_seq);
+ if (READ_ONCE(msk->use_64bit_ack)) {
+ ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+ opts->ext_copy.data_ack = ack_seq;
+ opts->ext_copy.ack64 = 1;
+ } else {
+ ack_size = TCPOLEN_MPTCP_DSS_ACK32;
+ opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
+ opts->ext_copy.ack64 = 0;
+ }
+ opts->ext_copy.use_ack = 1;
+ opts->suboptions = OPTION_MPTCP_DSS;
+ WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
+
+ /* Add kind/length/subtype/flag overhead if mapping is not populated */
+ if (dss_size == 0)
+ ack_size += TCPOLEN_MPTCP_DSS_BASE;
+
+ dss_size += ack_size;
+
+ *size = ALIGN(dss_size, 4);
+ return true;
+}
+
+static u64 add_addr_generate_hmac(u64 key1, u64 key2,
+ struct mptcp_addr_info *addr)
+{
+ u16 port = ntohs(addr->port);
+ u8 hmac[SHA256_DIGEST_SIZE];
+ u8 msg[19];
+ int i = 0;
+
+ msg[i++] = addr->id;
+ if (addr->family == AF_INET) {
+ memcpy(&msg[i], &addr->addr.s_addr, 4);
+ i += 4;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6) {
+ memcpy(&msg[i], &addr->addr6.s6_addr, 16);
+ i += 16;
+ }
+#endif
+ msg[i++] = port >> 8;
+ msg[i++] = port & 0xFF;
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac);
+
+ return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
+}
+
+static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ bool drop_other_suboptions = false;
+ unsigned int opt_size = *size;
+ bool echo;
+ int len;
+
+ /* add addr will strip the existing options, be sure to avoid breaking
+ * MPC/MPJ handshakes
+ */
+ if (!mptcp_pm_should_add_signal(msk) ||
+ (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
+ !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
+ &echo, &drop_other_suboptions))
+ return false;
+
+ if (drop_other_suboptions)
+ remaining += opt_size;
+ len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
+ if (remaining < len)
+ return false;
+
+ *size = len;
+ if (drop_other_suboptions) {
+ pr_debug("drop other suboptions");
+ opts->suboptions = 0;
+
+ /* note that e.g. DSS could have written into the memory
+ * aliased by ahmac, we must reset the field here
+ * to avoid appending the hmac even for ADD_ADDR echo
+ * options
+ */
+ opts->ahmac = 0;
+ *size -= opt_size;
+ }
+ opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
+ if (!echo) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
+ opts->ahmac = add_addr_generate_hmac(msk->local_key,
+ msk->remote_key,
+ &opts->addr);
+ } else {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
+ }
+ pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
+ opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));
+
+ return true;
+}
+
+static bool mptcp_established_options_rm_addr(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_rm_list rm_list;
+ int i, len;
+
+ if (!mptcp_pm_should_rm_signal(msk) ||
+ !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
+ return false;
+
+ len = mptcp_rm_addr_len(&rm_list);
+ if (len < 0)
+ return false;
+ if (remaining < len)
+ return false;
+
+ *size = len;
+ opts->suboptions |= OPTION_MPTCP_RM_ADDR;
+ opts->rm_list = rm_list;
+
+ for (i = 0; i < opts->rm_list.nr; i++)
+ pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
+ MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr);
+ return true;
+}
+
+static bool mptcp_established_options_mp_prio(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ /* can't send MP_PRIO with MPC, as they share the same option space:
+ * 'backup'. Also it makes no sense at all
+ */
+ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC))
+ return false;
+
+ /* account for the trailing 'nop' option */
+ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN)
+ return false;
+
+ *size = TCPOLEN_MPTCP_PRIO_ALIGN;
+ opts->suboptions |= OPTION_MPTCP_PRIO;
+ opts->backup = subflow->request_bkup;
+
+ pr_debug("prio=%d", opts->backup);
+
+ return true;
+}
+
+static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ if (remaining < TCPOLEN_MPTCP_RST)
+ return false;
+
+ *size = TCPOLEN_MPTCP_RST;
+ opts->suboptions |= OPTION_MPTCP_RST;
+ opts->reset_transient = subflow->reset_transient;
+ opts->reset_reason = subflow->reset_reason;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);
+
+ return true;
+}
+
+static bool mptcp_established_options_fastclose(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ if (likely(!subflow->send_fastclose))
+ return false;
+
+ if (remaining < TCPOLEN_MPTCP_FASTCLOSE)
+ return false;
+
+ *size = TCPOLEN_MPTCP_FASTCLOSE;
+ opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
+ opts->rcvr_key = msk->remote_key;
+
+ pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
+ return true;
+}
+
+static bool mptcp_established_options_mp_fail(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ if (likely(!subflow->send_mp_fail))
+ return false;
+
+ if (remaining < TCPOLEN_MPTCP_FAIL)
+ return false;
+
+ *size = TCPOLEN_MPTCP_FAIL;
+ opts->suboptions |= OPTION_MPTCP_FAIL;
+ opts->fail_seq = subflow->map_seq;
+
+ pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
+
+ return true;
+}
+
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size, unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ unsigned int opt_size = 0;
+ bool snd_data_fin;
+ bool ret = false;
+
+ opts->suboptions = 0;
+
+ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb)))
+ return false;
+
+ if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
+ if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) ||
+ mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ }
+ /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
+ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ }
+ return true;
+ }
+
+ snd_data_fin = mptcp_data_fin_enabled(msk);
+ if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts))
+ ret = true;
+ else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) {
+ unsigned int mp_fail_size;
+
+ ret = true;
+ if (mptcp_established_options_mp_fail(sk, &mp_fail_size,
+ remaining - opt_size, opts)) {
+ *size += opt_size + mp_fail_size;
+ remaining -= opt_size - mp_fail_size;
+ return true;
+ }
+ }
+
+ /* we reserved enough space for the above options, and exceeding the
+ * TCP option space would be fatal
+ */
+ if (WARN_ON_ONCE(opt_size > remaining))
+ return false;
+
+ *size += opt_size;
+ remaining -= opt_size;
+ if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
+
+ if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
+
+ return ret;
+}
+
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+ if (subflow_req->mp_capable) {
+ opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
+ opts->sndr_key = subflow_req->local_key;
+ opts->csum_reqd = subflow_req->csum_reqd;
+ opts->allow_join_id0 = subflow_req->allow_join_id0;
+ *size = TCPOLEN_MPTCP_MPC_SYNACK;
+ pr_debug("subflow_req=%p, local_key=%llu",
+ subflow_req, subflow_req->local_key);
+ return true;
+ } else if (subflow_req->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
+ opts->backup = subflow_req->backup;
+ opts->join_id = subflow_req->local_id;
+ opts->thmac = subflow_req->thmac;
+ opts->nonce = subflow_req->local_nonce;
+ pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ subflow_req, opts->backup, opts->join_id,
+ opts->thmac, opts->nonce);
+ *size = TCPOLEN_MPTCP_MPJ_SYNACK;
+ return true;
+ }
+ return false;
+}
+
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt)
+{
+ /* here we can process OoO, in-window pkts, only in-sequence 4th ack
+ * will make the subflow fully established
+ */
+ if (likely(subflow->fully_established)) {
+ /* on passive sockets, check for 3rd ack retransmission
+ * note that msk is always set by subflow_syn_recv_sock()
+ * for mp_join subflows
+ */
+ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
+ subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) &&
+ !subflow->request_join)
+ tcp_send_ack(ssk);
+ goto check_notify;
+ }
+
+ /* we must process OoO packets before the first subflow is fully
+ * established. OoO packets are instead a protocol violation
+ * for MP_JOIN subflows as the peer must not send any data
+ * before receiving the forth ack - cfr. RFC 8684 section 3.2.
+ */
+ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
+ if (subflow->mp_join)
+ goto reset;
+ if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
+ goto set_fully_established;
+ return subflow->mp_capable;
+ }
+
+ if (subflow->remote_key_valid &&
+ (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
+ ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) {
+ /* subflows are fully established as soon as we get any
+ * additional ack, including ADD_ADDR.
+ */
+ subflow->fully_established = 1;
+ WRITE_ONCE(msk->fully_established, true);
+ goto check_notify;
+ }
+
+ /* If the first established packet does not contain MP_CAPABLE + data
+ * then fallback to TCP. Fallback scenarios requires a reset for
+ * MP_JOIN subflows.
+ */
+ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) {
+ if (subflow->mp_join)
+ goto reset;
+ subflow->mp_capable = 0;
+ pr_fallback(msk);
+ mptcp_do_fallback(ssk);
+ return false;
+ }
+
+ if (mp_opt->deny_join_id0)
+ WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
+
+set_fully_established:
+ if (unlikely(!READ_ONCE(msk->pm.server_side)))
+ pr_warn_once("bogus mpc option on established client sk");
+ mptcp_subflow_fully_established(subflow, mp_opt);
+
+check_notify:
+ /* if the subflow is not already linked into the conn_list, we can't
+ * notify the PM: this subflow is still on the listener queue
+ * and the PM possibly acquiring the subflow lock could race with
+ * the listener close
+ */
+ if (likely(subflow->pm_notified) || list_empty(&subflow->node))
+ return true;
+
+ subflow->pm_notified = 1;
+ if (subflow->mp_join) {
+ clear_3rdack_retransmission(ssk);
+ mptcp_pm_subflow_established(msk);
+ } else {
+ mptcp_pm_fully_established(msk, ssk);
+ }
+ return true;
+
+reset:
+ mptcp_subflow_reset(ssk);
+ return false;
+}
+
+u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
+{
+ u32 old_seq32, cur_seq32;
+
+ old_seq32 = (u32)old_seq;
+ cur_seq32 = (u32)cur_seq;
+ cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32;
+ if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32)))
+ return cur_seq + (1LL << 32);
+
+ /* reverse wrap could happen, too */
+ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32)))
+ return cur_seq - (1LL << 32);
+ return cur_seq;
+}
+
+static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
+{
+ msk->bytes_acked += new_snd_una - msk->snd_una;
+ msk->snd_una = new_snd_una;
+}
+
+static void ack_update_msk(struct mptcp_sock *msk,
+ struct sock *ssk,
+ struct mptcp_options_received *mp_opt)
+{
+ u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
+ struct sock *sk = (struct sock *)msk;
+ u64 old_snd_una;
+
+ mptcp_data_lock(sk);
+
+ /* avoid ack expansion on update conflict, to reduce the risk of
+ * wrongly expanding to a future ack sequence number, which is way
+ * more dangerous than missing an ack
+ */
+ old_snd_una = msk->snd_una;
+ new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
+
+ /* ACK for data not even sent yet? Ignore.*/
+ if (unlikely(after64(new_snd_una, snd_nxt)))
+ new_snd_una = old_snd_una;
+
+ new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
+
+ if (after64(new_wnd_end, msk->wnd_end))
+ msk->wnd_end = new_wnd_end;
+
+ /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
+ if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
+ __mptcp_check_push(sk, ssk);
+
+ if (after64(new_snd_una, old_snd_una)) {
+ __mptcp_snd_una_update(msk, new_snd_una);
+ __mptcp_data_acked(sk);
+ }
+ mptcp_data_unlock(sk);
+
+ trace_ack_update_msk(mp_opt->data_ack,
+ old_snd_una, new_snd_una,
+ new_wnd_end, msk->wnd_end);
+}
+
+bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
+{
+ /* Skip if DATA_FIN was already received.
+ * If updating simultaneously with the recvmsg loop, values
+ * should match. If they mismatch, the peer is misbehaving and
+ * we will prefer the most recent information.
+ */
+ if (READ_ONCE(msk->rcv_data_fin))
+ return false;
+
+ WRITE_ONCE(msk->rcv_data_fin_seq,
+ mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit));
+ WRITE_ONCE(msk->rcv_data_fin, 1);
+
+ return true;
+}
+
+static bool add_addr_hmac_valid(struct mptcp_sock *msk,
+ struct mptcp_options_received *mp_opt)
+{
+ u64 hmac = 0;
+
+ if (mp_opt->echo)
+ return true;
+
+ hmac = add_addr_generate_hmac(msk->remote_key,
+ msk->local_key,
+ &mp_opt->addr);
+
+ pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
+ msk, hmac, mp_opt->ahmac);
+
+ return hmac == mp_opt->ahmac;
+}
+
+/* Return false if a subflow has been reset, else return true */
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_options_received mp_opt;
+ struct mptcp_ext *mpext;
+
+ if (__mptcp_check_fallback(msk)) {
+ /* Keep it simple and unconditionally trigger send data cleanup and
+ * pending queue spooling. We will need to acquire the data lock
+ * for more accurate checks, and once the lock is acquired, such
+ * helpers are cheap.
+ */
+ mptcp_data_lock(subflow->conn);
+ if (sk_stream_memory_free(sk))
+ __mptcp_check_push(subflow->conn, sk);
+
+ /* on fallback we just need to ignore the msk-level snd_una, as
+ * this is really plain TCP
+ */
+ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
+
+ __mptcp_data_acked(subflow->conn);
+ mptcp_data_unlock(subflow->conn);
+ return true;
+ }
+
+ mptcp_get_options(skb, &mp_opt);
+
+ /* The subflow can be in close state only if check_fully_established()
+ * just sent a reset. If so, tell the caller to ignore the current packet.
+ */
+ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
+ return sk->sk_state != TCP_CLOSE;
+
+ if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) {
+ if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) &&
+ msk->local_key == mp_opt.rcvr_key) {
+ WRITE_ONCE(msk->rcv_fastclose, true);
+ mptcp_schedule_work((struct sock *)msk);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX);
+ }
+
+ if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) &&
+ add_addr_hmac_valid(msk, &mp_opt)) {
+ if (!mp_opt.echo) {
+ mptcp_pm_add_addr_received(sk, &mp_opt.addr);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
+ } else {
+ mptcp_pm_add_addr_echoed(msk, &mp_opt.addr);
+ mptcp_pm_del_add_timer(msk, &mp_opt.addr, true);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
+ }
+
+ if (mp_opt.addr.port)
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD);
+ }
+
+ if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR)
+ mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
+
+ if (mp_opt.suboptions & OPTION_MPTCP_PRIO) {
+ mptcp_pm_mp_prio_received(sk, mp_opt.backup);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX);
+ }
+
+ if (mp_opt.suboptions & OPTION_MPTCP_FAIL) {
+ mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX);
+ }
+
+ if (mp_opt.suboptions & OPTION_MPTCP_RST) {
+ subflow->reset_seen = 1;
+ subflow->reset_reason = mp_opt.reset_reason;
+ subflow->reset_transient = mp_opt.reset_transient;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX);
+ }
+
+ if (!(mp_opt.suboptions & OPTION_MPTCP_DSS))
+ return true;
+ }
+
+ /* we can't wait for recvmsg() to update the ack_seq, otherwise
+ * monodirectional flows will stuck
+ */
+ if (mp_opt.use_ack)
+ ack_update_msk(msk, sk, &mp_opt);
+
+ /* Zero-data-length packets are dropped by the caller and not
+ * propagated to the MPTCP layer, so the skb extension does not
+ * need to be allocated or populated. DATA_FIN information, if
+ * present, needs to be updated here before the skb is freed.
+ */
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ if (mp_opt.data_fin && mp_opt.data_len == 1 &&
+ mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64))
+ mptcp_schedule_work((struct sock *)msk);
+
+ return true;
+ }
+
+ mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
+ if (!mpext)
+ return true;
+
+ memset(mpext, 0, sizeof(*mpext));
+
+ if (likely(mp_opt.use_map)) {
+ if (mp_opt.mpc_map) {
+ /* this is an MP_CAPABLE carrying MPTCP data
+ * we know this map the first chunk of data
+ */
+ mptcp_crypto_key_sha(subflow->remote_key, NULL,
+ &mpext->data_seq);
+ mpext->data_seq++;
+ mpext->subflow_seq = 1;
+ mpext->dsn64 = 1;
+ mpext->mpc_map = 1;
+ mpext->data_fin = 0;
+ } else {
+ mpext->data_seq = mp_opt.data_seq;
+ mpext->subflow_seq = mp_opt.subflow_seq;
+ mpext->dsn64 = mp_opt.dsn64;
+ mpext->data_fin = mp_opt.data_fin;
+ }
+ mpext->data_len = mp_opt.data_len;
+ mpext->use_map = 1;
+ mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD);
+
+ if (mpext->csum_reqd)
+ mpext->csum = mp_opt.csum;
+ }
+
+ return true;
+}
+
+static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
+{
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+ u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
+ struct mptcp_sock *msk;
+ u32 new_win;
+ u64 win;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ msk = mptcp_sk(subflow->conn);
+
+ ack_seq = READ_ONCE(msk->ack_seq);
+ rcv_wnd_new = ack_seq + tp->rcv_wnd;
+
+ rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
+ if (after64(rcv_wnd_new, rcv_wnd_old)) {
+ u64 rcv_wnd;
+
+ for (;;) {
+ rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
+
+ if (rcv_wnd == rcv_wnd_old)
+ break;
+
+ rcv_wnd_old = rcv_wnd;
+ if (before64(rcv_wnd_new, rcv_wnd_old)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
+ goto raise_win;
+ }
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
+ }
+ return;
+ }
+
+ if (rcv_wnd_new != rcv_wnd_old) {
+raise_win:
+ win = rcv_wnd_old - ack_seq;
+ tp->rcv_wnd = min_t(u64, win, U32_MAX);
+ new_win = tp->rcv_wnd;
+
+ /* Make sure we do not exceed the maximum possible
+ * scaled window.
+ */
+ if (unlikely(th->syn))
+ new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
+ if (!tp->rx_opt.rcv_wscale &&
+ READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows))
+ new_win = min(new_win, MAX_TCP_WINDOW);
+ else
+ new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rx_opt.rcv_wscale;
+ th->window = htons(new_win);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
+ }
+}
+
+__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
+{
+ struct csum_pseudo_header header;
+ __wsum csum;
+
+ /* cfr RFC 8684 3.3.1.:
+ * the data sequence number used in the pseudo-header is
+ * always the 64-bit value, irrespective of what length is used in the
+ * DSS option itself.
+ */
+ header.data_seq = cpu_to_be64(data_seq);
+ header.subflow_seq = htonl(subflow_seq);
+ header.data_len = htons(data_len);
+ header.csum = 0;
+
+ csum = csum_partial(&header, sizeof(header), sum);
+ return csum_fold(csum);
+}
+
+static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext)
+{
+ return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+ ~csum_unfold(mpext->csum));
+}
+
+static void put_len_csum(u16 len, __sum16 csum, void *data)
+{
+ __sum16 *sumptr = data + 2;
+ __be16 *ptr = data;
+
+ put_unaligned_be16(len, ptr);
+
+ put_unaligned(csum, sumptr);
+}
+
+void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
+ struct mptcp_out_options *opts)
+{
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+
+ /* Which options can be used together?
+ *
+ * X: mutually exclusive
+ * O: often used together
+ * C: can be used together in some cases
+ * P: could be used together but we prefer not to (optimisations)
+ *
+ * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC |
+ * ------|------|------|------|------|------|------|------|------|
+ * MPC |------|------|------|------|------|------|------|------|
+ * MPJ | X |------|------|------|------|------|------|------|
+ * DSS | X | X |------|------|------|------|------|------|
+ * ADD | X | X | P |------|------|------|------|------|
+ * RM | C | C | C | P |------|------|------|------|
+ * PRIO | X | C | C | C | C |------|------|------|
+ * FAIL | X | X | C | X | X | X |------|------|
+ * FC | X | X | X | X | X | X | X |------|
+ * RST | X | X | X | X | X | X | O | O |
+ * ------|------|------|------|------|------|------|------|------|
+ *
+ * The same applies in mptcp_established_options() function.
+ */
+ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
+ struct mptcp_ext *mpext = &opts->ext_copy;
+ u8 len = TCPOLEN_MPTCP_DSS_BASE;
+ u8 flags = 0;
+
+ if (mpext->use_ack) {
+ flags = MPTCP_DSS_HAS_ACK;
+ if (mpext->ack64) {
+ len += TCPOLEN_MPTCP_DSS_ACK64;
+ flags |= MPTCP_DSS_ACK64;
+ } else {
+ len += TCPOLEN_MPTCP_DSS_ACK32;
+ }
+ }
+
+ if (mpext->use_map) {
+ len += TCPOLEN_MPTCP_DSS_MAP64;
+
+ /* Use only 64-bit mapping flags for now, add
+ * support for optional 32-bit mappings later.
+ */
+ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
+ if (mpext->data_fin)
+ flags |= MPTCP_DSS_DATA_FIN;
+
+ if (opts->csum_reqd)
+ len += TCPOLEN_MPTCP_DSS_CHECKSUM;
+ }
+
+ *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
+
+ if (mpext->use_ack) {
+ if (mpext->ack64) {
+ put_unaligned_be64(mpext->data_ack, ptr);
+ ptr += 2;
+ } else {
+ put_unaligned_be32(mpext->data_ack32, ptr);
+ ptr += 1;
+ }
+ }
+
+ if (mpext->use_map) {
+ put_unaligned_be64(mpext->data_seq, ptr);
+ ptr += 2;
+ put_unaligned_be32(mpext->subflow_seq, ptr);
+ ptr += 1;
+ if (opts->csum_reqd) {
+ /* data_len == 0 is reserved for the infinite mapping,
+ * the checksum will also be set to 0.
+ */
+ put_len_csum(mpext->data_len,
+ (mpext->data_len ? mptcp_make_csum(mpext) : 0),
+ ptr);
+ } else {
+ put_unaligned_be32(mpext->data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ }
+ ptr += 1;
+ }
+
+ /* We might need to add MP_FAIL options in rare cases */
+ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions))
+ goto mp_fail;
+ } else if (OPTIONS_MPTCP_MPC & opts->suboptions) {
+ u8 len, flag = MPTCP_CAP_HMAC_SHA256;
+
+ if (OPTION_MPTCP_MPC_SYN & opts->suboptions) {
+ len = TCPOLEN_MPTCP_MPC_SYN;
+ } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) {
+ len = TCPOLEN_MPTCP_MPC_SYNACK;
+ } else if (opts->data_len) {
+ len = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ if (opts->csum_reqd)
+ len += TCPOLEN_MPTCP_DSS_CHECKSUM;
+ } else {
+ len = TCPOLEN_MPTCP_MPC_ACK;
+ }
+
+ if (opts->csum_reqd)
+ flag |= MPTCP_CAP_CHECKSUM_REQD;
+
+ if (!opts->allow_join_id0)
+ flag |= MPTCP_CAP_DENY_JOIN_ID0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
+ MPTCP_SUPPORTED_VERSION,
+ flag);
+
+ if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
+ opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->sndr_key, ptr);
+ ptr += 2;
+ if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->rcvr_key, ptr);
+ ptr += 2;
+ if (!opts->data_len)
+ goto mp_capable_done;
+
+ if (opts->csum_reqd) {
+ put_len_csum(opts->data_len,
+ __mptcp_make_csum(opts->data_seq,
+ opts->subflow_seq,
+ opts->data_len,
+ ~csum_unfold(opts->csum)),
+ ptr);
+ } else {
+ put_unaligned_be32(opts->data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ }
+ ptr += 1;
+
+ /* MPC is additionally mutually exclusive with MP_PRIO */
+ goto mp_capable_done;
+ } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
+ if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYN,
+ opts->backup, opts->join_id);
+ put_unaligned_be32(opts->token, ptr);
+ ptr += 1;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYNACK,
+ opts->backup, opts->join_id);
+ put_unaligned_be64(opts->thmac, ptr);
+ ptr += 2;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ } else {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+ memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+ ptr += 5;
+ }
+ } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
+ u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
+ u8 echo = MPTCP_ADDR_ECHO;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (opts->addr.family == AF_INET6)
+ len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
+#endif
+
+ if (opts->addr.port)
+ len += TCPOLEN_MPTCP_PORT_LEN;
+
+ if (opts->ahmac) {
+ len += sizeof(opts->ahmac);
+ echo = 0;
+ }
+
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ len, echo, opts->addr.id);
+ if (opts->addr.family == AF_INET) {
+ memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4);
+ ptr += 1;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opts->addr.family == AF_INET6) {
+ memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16);
+ ptr += 4;
+ }
+#endif
+
+ if (!opts->addr.port) {
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ } else {
+ u16 port = ntohs(opts->addr.port);
+
+ if (opts->ahmac) {
+ u8 *bptr = (u8 *)ptr;
+
+ put_unaligned_be16(port, bptr);
+ bptr += 2;
+ put_unaligned_be64(opts->ahmac, bptr);
+ bptr += 8;
+ put_unaligned_be16(TCPOPT_NOP << 8 |
+ TCPOPT_NOP, bptr);
+
+ ptr += 3;
+ } else {
+ put_unaligned_be32(port << 16 |
+ TCPOPT_NOP << 8 |
+ TCPOPT_NOP, ptr);
+ ptr += 1;
+ }
+ }
+ } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) {
+ /* FASTCLOSE is mutually exclusive with others except RST */
+ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE,
+ TCPOLEN_MPTCP_FASTCLOSE,
+ 0, 0);
+ put_unaligned_be64(opts->rcvr_key, ptr);
+ ptr += 2;
+
+ if (OPTION_MPTCP_RST & opts->suboptions)
+ goto mp_rst;
+ return;
+ } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
+mp_fail:
+ /* MP_FAIL is mutually exclusive with others except RST */
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->send_mp_fail = 0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
+ TCPOLEN_MPTCP_FAIL,
+ 0, 0);
+ put_unaligned_be64(opts->fail_seq, ptr);
+ ptr += 2;
+
+ if (OPTION_MPTCP_RST & opts->suboptions)
+ goto mp_rst;
+ return;
+ } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
+mp_rst:
+ *ptr++ = mptcp_option(MPTCPOPT_RST,
+ TCPOLEN_MPTCP_RST,
+ opts->reset_transient,
+ opts->reset_reason);
+ return;
+ }
+
+ if (OPTION_MPTCP_PRIO & opts->suboptions) {
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->send_mp_prio = 0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
+ TCPOLEN_MPTCP_PRIO,
+ opts->backup, TCPOPT_NOP);
+
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX);
+ }
+
+mp_capable_done:
+ if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
+ u8 i = 1;
+
+ *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
+ TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
+ 0, opts->rm_list.ids[0]);
+
+ while (i < opts->rm_list.nr) {
+ u8 id1, id2, id3, id4;
+
+ id1 = opts->rm_list.ids[i];
+ id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
+ id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
+ id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
+ put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
+ ptr += 1;
+ i += 4;
+ }
+ }
+
+ if (tp)
+ mptcp_set_rwin(tp, th);
+}
+
+__be32 mptcp_get_reset_option(const struct sk_buff *skb)
+{
+ const struct mptcp_ext *ext = mptcp_get_ext(skb);
+ u8 flags, reason;
+
+ if (ext) {
+ flags = ext->reset_transient;
+ reason = ext->reset_reason;
+
+ return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST,
+ flags, reason);
+ }
+
+ return htonl(0u);
+}
+EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
new file mode 100644
index 000000000..d8da5374d
--- /dev/null
+++ b/net/mptcp/pm.c
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+#include "mib.h"
+
+/* path manager command handlers */
+
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr,
+ bool echo)
+{
+ u8 add_addr = READ_ONCE(msk->pm.addr_signal);
+
+ pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);
+
+ lockdep_assert_held(&msk->pm.lock);
+
+ if (add_addr &
+ (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
+ MPTCP_INC_STATS(sock_net((struct sock *)msk),
+ echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP);
+ return -EINVAL;
+ }
+
+ if (echo) {
+ msk->pm.remote = *addr;
+ add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
+ } else {
+ msk->pm.local = *addr;
+ add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
+ }
+ WRITE_ONCE(msk->pm.addr_signal, add_addr);
+ return 0;
+}
+
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
+{
+ u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
+
+ pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
+
+ if (rm_addr) {
+ MPTCP_ADD_STATS(sock_net((struct sock *)msk),
+ MPTCP_MIB_RMADDRTXDROP, rm_list->nr);
+ return -EINVAL;
+ }
+
+ msk->pm.rm_list_tx = *rm_list;
+ rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
+ WRITE_ONCE(msk->pm.addr_signal, rm_addr);
+ mptcp_pm_nl_addr_send_ack(msk);
+ return 0;
+}
+
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
+{
+ pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_nl_rm_subflow_received(msk, rm_list);
+ spin_unlock_bh(&msk->pm.lock);
+ return 0;
+}
+
+/* path manager event handlers */
+
+void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
+
+ WRITE_ONCE(pm->server_side, server_side);
+ mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
+}
+
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ unsigned int subflows_max;
+ int ret = 0;
+
+ if (mptcp_pm_is_userspace(msk)) {
+ if (mptcp_userspace_pm_active(msk)) {
+ spin_lock_bh(&pm->lock);
+ pm->subflows++;
+ spin_unlock_bh(&pm->lock);
+ return true;
+ }
+ return false;
+ }
+
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
+ pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
+ subflows_max, READ_ONCE(pm->accept_subflow));
+
+ /* try to avoid acquiring the lock below */
+ if (!READ_ONCE(pm->accept_subflow))
+ return false;
+
+ spin_lock_bh(&pm->lock);
+ if (READ_ONCE(pm->accept_subflow)) {
+ ret = pm->subflows < subflows_max;
+ if (ret && ++pm->subflows == subflows_max)
+ WRITE_ONCE(pm->accept_subflow, false);
+ }
+ spin_unlock_bh(&pm->lock);
+
+ return ret;
+}
+
+/* return true if the new status bit is currently cleared, that is, this event
+ * can be server, eventually by an already scheduled work
+ */
+static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
+ enum mptcp_pm_status new_status)
+{
+ pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+ BIT(new_status));
+ if (msk->pm.status & BIT(new_status))
+ return false;
+
+ msk->pm.status |= BIT(new_status);
+ mptcp_schedule_work((struct sock *)msk);
+ return true;
+}
+
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ bool announce = false;
+
+ pr_debug("msk=%p", msk);
+
+ spin_lock_bh(&pm->lock);
+
+ /* mptcp_pm_fully_established() can be invoked by multiple
+ * racing paths - accept() and check_fully_established()
+ * be sure to serve this event only once.
+ */
+ if (READ_ONCE(pm->work_pending) &&
+ !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
+
+ if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
+ announce = true;
+
+ msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
+ spin_unlock_bh(&pm->lock);
+
+ if (announce)
+ mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
+}
+
+void mptcp_pm_connection_closed(struct mptcp_sock *msk)
+{
+ pr_debug("msk=%p", msk);
+}
+
+void mptcp_pm_subflow_established(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p", msk);
+
+ if (!READ_ONCE(pm->work_pending))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ if (READ_ONCE(pm->work_pending))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+ const struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ bool update_subflows;
+
+ update_subflows = subflow->request_join || subflow->mp_join;
+ if (mptcp_pm_is_userspace(msk)) {
+ if (update_subflows) {
+ spin_lock_bh(&pm->lock);
+ pm->subflows--;
+ spin_unlock_bh(&pm->lock);
+ }
+ return;
+ }
+
+ if (!READ_ONCE(pm->work_pending) && !update_subflows)
+ return;
+
+ spin_lock_bh(&pm->lock);
+ if (update_subflows)
+ __mptcp_pm_close_subflow(msk);
+
+ /* Even if this subflow is not really established, tell the PM to try
+ * to pick the next ones, if possible.
+ */
+ if (mptcp_pm_nl_check_work_pending(msk))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_add_addr_received(const struct sock *ssk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+ READ_ONCE(pm->accept_addr));
+
+ mptcp_event_addr_announced(ssk, addr);
+
+ spin_lock_bh(&pm->lock);
+
+ if (mptcp_pm_is_userspace(msk)) {
+ if (mptcp_userspace_pm_active(msk)) {
+ mptcp_pm_announce_addr(msk, addr, true);
+ mptcp_pm_add_addr_send_ack(msk);
+ } else {
+ __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
+ }
+ } else if (!READ_ONCE(pm->accept_addr)) {
+ mptcp_pm_announce_addr(msk, addr, true);
+ mptcp_pm_add_addr_send_ack(msk);
+ } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->remote = *addr;
+ } else {
+ __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
+ }
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p", msk);
+
+ spin_lock_bh(&pm->lock);
+
+ if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
+{
+ if (!mptcp_pm_should_add_signal(msk))
+ return;
+
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
+}
+
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ u8 i;
+
+ pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
+
+ for (i = 0; i < rm_list->nr; i++)
+ mptcp_event_addr_removed(msk, rm_list->ids[i]);
+
+ spin_lock_bh(&pm->lock);
+ if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
+ pm->rm_list_rx = *rm_list;
+ else
+ __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+ struct mptcp_sock *msk;
+
+ pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
+ msk = mptcp_sk(sk);
+ if (subflow->backup != bkup)
+ subflow->backup = bkup;
+
+ mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
+}
+
+void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ pr_debug("fail_seq=%llu", fail_seq);
+
+ if (!READ_ONCE(msk->allow_infinite_fallback))
+ return;
+
+ if (!subflow->fail_tout) {
+ pr_debug("send MP_FAIL response and infinite map");
+
+ subflow->send_mp_fail = 1;
+ subflow->send_infinite_map = 1;
+ tcp_send_ack(sk);
+ } else {
+ pr_debug("MP_FAIL response received");
+ WRITE_ONCE(subflow->fail_tout, 0);
+ }
+}
+
+/* path manager helpers */
+
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
+ unsigned int opt_size, unsigned int remaining,
+ struct mptcp_addr_info *addr, bool *echo,
+ bool *drop_other_suboptions)
+{
+ int ret = false;
+ u8 add_addr;
+ u8 family;
+ bool port;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ /* double check after the lock is acquired */
+ if (!mptcp_pm_should_add_signal(msk))
+ goto out_unlock;
+
+ /* always drop every other options for pure ack ADD_ADDR; this is a
+ * plain dup-ack from TCP perspective. The other MPTCP-relevant info,
+ * if any, will be carried by the 'original' TCP ack
+ */
+ if (skb && skb_is_tcp_pure_ack(skb)) {
+ remaining += opt_size;
+ *drop_other_suboptions = true;
+ }
+
+ *echo = mptcp_pm_should_add_signal_echo(msk);
+ port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
+
+ family = *echo ? msk->pm.remote.family : msk->pm.local.family;
+ if (remaining < mptcp_add_addr_len(family, *echo, port))
+ goto out_unlock;
+
+ if (*echo) {
+ *addr = msk->pm.remote;
+ add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
+ } else {
+ *addr = msk->pm.local;
+ add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
+ }
+ WRITE_ONCE(msk->pm.addr_signal, add_addr);
+ ret = true;
+
+out_unlock:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_rm_list *rm_list)
+{
+ int ret = false, len;
+ u8 rm_addr;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ /* double check after the lock is acquired */
+ if (!mptcp_pm_should_rm_signal(msk))
+ goto out_unlock;
+
+ rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL);
+ len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
+ if (len < 0) {
+ WRITE_ONCE(msk->pm.addr_signal, rm_addr);
+ goto out_unlock;
+ }
+ if (remaining < len)
+ goto out_unlock;
+
+ *rm_list = msk->pm.rm_list_tx;
+ WRITE_ONCE(msk->pm.addr_signal, rm_addr);
+ ret = true;
+
+out_unlock:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ struct mptcp_addr_info skc_local;
+ struct mptcp_addr_info msk_local;
+
+ if (WARN_ON_ONCE(!msk))
+ return -1;
+
+ /* The 0 ID mapping is defined by the first subflow, copied into the msk
+ * addr
+ */
+ mptcp_local_address((struct sock_common *)msk, &msk_local);
+ mptcp_local_address((struct sock_common *)skc, &skc_local);
+ if (mptcp_addresses_equal(&msk_local, &skc_local, false))
+ return 0;
+
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_get_local_id(msk, &skc_local);
+ return mptcp_pm_nl_get_local_id(msk, &skc_local);
+}
+
+int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
+ u8 *flags, int *ifindex)
+{
+ *flags = 0;
+ *ifindex = 0;
+
+ if (!id)
+ return 0;
+
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
+ return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
+}
+
+int mptcp_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup)
+{
+ if (token)
+ return mptcp_userspace_pm_set_flags(net, token, loc, rem, bkup);
+ return mptcp_pm_nl_set_flags(net, loc, bkup);
+}
+
+void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);
+
+ /* keep track of rtx periods with no progress */
+ if (!subflow->stale_count) {
+ subflow->stale_rcv_tstamp = rcv_tstamp;
+ subflow->stale_count++;
+ } else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
+ if (subflow->stale_count < U8_MAX)
+ subflow->stale_count++;
+ mptcp_pm_nl_subflow_chk_stale(msk, ssk);
+ } else {
+ subflow->stale_count = 0;
+ mptcp_subflow_set_active(subflow);
+ }
+}
+
+/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses,
+ * otherwise allow any matching local/remote pair
+ */
+bool mptcp_pm_addr_families_match(const struct sock *sk,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *rem)
+{
+ bool mptcp_is_v4 = sk->sk_family == AF_INET;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
+ bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);
+
+ if (mptcp_is_v4)
+ return loc_is_v4 && rem_is_v4;
+
+ if (ipv6_only_sock(sk))
+ return !loc_is_v4 && !rem_is_v4;
+
+ return loc_is_v4 == rem_is_v4;
+#else
+ return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
+#endif
+}
+
+void mptcp_pm_data_reset(struct mptcp_sock *msk)
+{
+ u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pm->add_addr_signaled = 0;
+ pm->add_addr_accepted = 0;
+ pm->local_addr_used = 0;
+ pm->subflows = 0;
+ pm->rm_list_tx.nr = 0;
+ pm->rm_list_rx.nr = 0;
+ WRITE_ONCE(pm->pm_type, pm_type);
+
+ if (pm_type == MPTCP_PM_TYPE_KERNEL) {
+ bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
+
+ /* pm->work_pending must be only be set to 'true' when
+ * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
+ */
+ WRITE_ONCE(pm->work_pending,
+ (!!mptcp_pm_get_local_addr_max(msk) &&
+ subflows_allowed) ||
+ !!mptcp_pm_get_add_addr_signal_max(msk));
+ WRITE_ONCE(pm->accept_addr,
+ !!mptcp_pm_get_add_addr_accept_max(msk) &&
+ subflows_allowed);
+ WRITE_ONCE(pm->accept_subflow, subflows_allowed);
+ } else {
+ WRITE_ONCE(pm->work_pending, 0);
+ WRITE_ONCE(pm->accept_addr, 0);
+ WRITE_ONCE(pm->accept_subflow, 0);
+ }
+
+ WRITE_ONCE(pm->addr_signal, 0);
+ WRITE_ONCE(pm->remote_deny_join_id0, false);
+ pm->status = 0;
+ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+}
+
+void mptcp_pm_data_init(struct mptcp_sock *msk)
+{
+ spin_lock_init(&msk->pm.lock);
+ INIT_LIST_HEAD(&msk->pm.anno_list);
+ INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
+ mptcp_pm_data_reset(msk);
+}
+
+void __init mptcp_pm_init(void)
+{
+ mptcp_pm_nl_init();
+}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
new file mode 100644
index 000000000..3011bc378
--- /dev/null
+++ b/net/mptcp/pm_netlink.c
@@ -0,0 +1,2406 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2020, Red Hat, Inc.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/inet.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <net/netns/generic.h>
+#include <net/mptcp.h>
+#include <net/genetlink.h>
+#include <uapi/linux/mptcp.h>
+
+#include "protocol.h"
+#include "mib.h"
+
+/* forward declaration */
+static struct genl_family mptcp_genl_family;
+
+static int pm_nl_pernet_id;
+
+struct mptcp_pm_add_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ u8 retrans_times;
+ struct timer_list add_timer;
+ struct mptcp_sock *sock;
+};
+
+struct pm_nl_pernet {
+ /* protects pernet updates */
+ spinlock_t lock;
+ struct list_head local_addr_list;
+ unsigned int addrs;
+ unsigned int stale_loss_cnt;
+ unsigned int add_addr_signal_max;
+ unsigned int add_addr_accept_max;
+ unsigned int local_addr_max;
+ unsigned int subflows_max;
+ unsigned int next_id;
+ DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+};
+
+#define MPTCP_PM_ADDR_MAX 8
+#define ADD_ADDR_RETRANS_MAX 3
+
+static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
+{
+ return net_generic(net, pm_nl_pernet_id);
+}
+
+static struct pm_nl_pernet *
+pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk)
+{
+ return pm_nl_get_pernet(sock_net((struct sock *)msk));
+}
+
+bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
+ const struct mptcp_addr_info *b, bool use_port)
+{
+ bool addr_equals = false;
+
+ if (a->family == b->family) {
+ if (a->family == AF_INET)
+ addr_equals = a->addr.s_addr == b->addr.s_addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else
+ addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+ } else if (a->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&b->addr6))
+ addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
+ } else if (b->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&a->addr6))
+ addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
+#endif
+ }
+
+ if (!addr_equals)
+ return false;
+ if (!use_port)
+ return true;
+
+ return a->port == b->port;
+}
+
+void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr)
+{
+ addr->family = skc->skc_family;
+ addr->port = htons(skc->skc_num);
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_rcv_saddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_rcv_saddr;
+#endif
+}
+
+static void remote_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->family = skc->skc_family;
+ addr->port = skc->skc_dport;
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_daddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_daddr;
+#endif
+}
+
+static bool lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info cur;
+ struct sock_common *skc;
+
+ list_for_each_entry(subflow, list, node) {
+ skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_local_address(skc, &cur);
+ if (mptcp_addresses_equal(&cur, saddr, saddr->port))
+ return true;
+ }
+
+ return false;
+}
+
+static bool lookup_subflow_by_daddr(const struct list_head *list,
+ const struct mptcp_addr_info *daddr)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info cur;
+ struct sock_common *skc;
+
+ list_for_each_entry(subflow, list, node) {
+ skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+
+ remote_address(skc, &cur);
+ if (mptcp_addresses_equal(&cur, daddr, daddr->port))
+ return true;
+ }
+
+ return false;
+}
+
+static struct mptcp_pm_addr_entry *
+select_local_address(const struct pm_nl_pernet *pernet,
+ const struct mptcp_sock *msk)
+{
+ struct mptcp_pm_addr_entry *entry, *ret = NULL;
+
+ msk_owned_by_me(msk);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ continue;
+
+ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+ continue;
+
+ ret = entry;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct mptcp_pm_addr_entry *
+select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
+{
+ struct mptcp_pm_addr_entry *entry, *ret = NULL;
+
+ rcu_read_lock();
+ /* do not keep any additional per socket state, just signal
+ * the address list in order.
+ * Note: removal from the local address list during the msk life-cycle
+ * can lead to additional addresses not being announced.
+ */
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
+ continue;
+
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ continue;
+
+ ret = entry;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
+{
+ const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->add_addr_signal_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
+
+unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->add_addr_accept_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
+
+unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->subflows_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
+
+unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->local_addr_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
+
+bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
+ (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
+ WRITE_ONCE(msk->pm.work_pending, false);
+ return false;
+ }
+ return true;
+}
+
+struct mptcp_pm_add_entry *
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ lockdep_assert_held(&msk->pm.lock);
+
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, addr, true))
+ return entry;
+ }
+
+ return NULL;
+}
+
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct mptcp_addr_info saddr;
+ bool ret = false;
+
+ mptcp_local_address((struct sock_common *)sk, &saddr);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
+ ret = true;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+static void mptcp_pm_add_timer(struct timer_list *timer)
+{
+ struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
+ struct mptcp_sock *msk = entry->sock;
+ struct sock *sk = (struct sock *)msk;
+
+ pr_debug("msk=%p", msk);
+
+ if (!msk)
+ return;
+
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return;
+
+ if (!entry->addr.id)
+ return;
+
+ if (mptcp_pm_should_add_signal_addr(msk)) {
+ sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
+ goto out;
+ }
+
+ spin_lock_bh(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal_addr(msk)) {
+ pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id);
+ mptcp_pm_announce_addr(msk, &entry->addr, false);
+ mptcp_pm_add_addr_send_ack(msk);
+ entry->retrans_times++;
+ }
+
+ if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
+ sk_reset_timer(sk, timer,
+ jiffies + mptcp_get_add_addr_timeout(sock_net(sk)));
+
+ spin_unlock_bh(&msk->pm.lock);
+
+ if (entry->retrans_times == ADD_ADDR_RETRANS_MAX)
+ mptcp_pm_subflow_established(msk);
+
+out:
+ __sock_put(sk);
+}
+
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr, bool check_id)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct sock *sk = (struct sock *)msk;
+
+ spin_lock_bh(&msk->pm.lock);
+ entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
+ if (entry && (!check_id || entry->addr.id == addr->id))
+ entry->retrans_times = ADD_ADDR_RETRANS_MAX;
+ spin_unlock_bh(&msk->pm.lock);
+
+ if (entry && (!check_id || entry->addr.id == addr->id))
+ sk_stop_timer_sync(sk, &entry->add_timer);
+
+ return entry;
+}
+
+bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *add_entry = NULL;
+ struct sock *sk = (struct sock *)msk;
+ struct net *net = sock_net(sk);
+
+ lockdep_assert_held(&msk->pm.lock);
+
+ add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
+
+ if (add_entry) {
+ if (mptcp_pm_is_kernel(msk))
+ return false;
+
+ sk_reset_timer(sk, &add_entry->add_timer,
+ jiffies + mptcp_get_add_addr_timeout(net));
+ return true;
+ }
+
+ add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
+ if (!add_entry)
+ return false;
+
+ list_add(&add_entry->list, &msk->pm.anno_list);
+
+ add_entry->addr = *addr;
+ add_entry->sock = msk;
+ add_entry->retrans_times = 0;
+
+ timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
+ sk_reset_timer(sk, &add_entry->add_timer,
+ jiffies + mptcp_get_add_addr_timeout(net));
+
+ return true;
+}
+
+void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_add_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ LIST_HEAD(free_list);
+
+ pr_debug("msk=%p", msk);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_splice_init(&msk->pm.anno_list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+ sk_stop_timer_sync(sk, &entry->add_timer);
+ kfree(entry);
+ }
+}
+
+static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr,
+ const struct mptcp_addr_info *addr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (addrs[i].id == addr->id)
+ return true;
+ }
+
+ return false;
+}
+
+/* Fill all the remote addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk,
+ struct mptcp_addr_info *local,
+ bool fullmesh,
+ struct mptcp_addr_info *addrs)
+{
+ bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
+ struct sock *sk = (struct sock *)msk, *ssk;
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info remote = { 0 };
+ unsigned int subflows_max;
+ int i = 0;
+
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+ remote_address((struct sock_common *)sk, &remote);
+
+ /* Non-fullmesh endpoint, fill in the single entry
+ * corresponding to the primary MPC subflow remote address
+ */
+ if (!fullmesh) {
+ if (deny_id0)
+ return 0;
+
+ if (!mptcp_pm_addr_families_match(sk, local, &remote))
+ return 0;
+
+ msk->pm.subflows++;
+ addrs[i++] = remote;
+ } else {
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ remote_address((struct sock_common *)ssk, &addrs[i]);
+ addrs[i].id = subflow->remote_id;
+ if (deny_id0 && !addrs[i].id)
+ continue;
+
+ if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
+ continue;
+
+ if (!lookup_address_in_vec(addrs, i, &addrs[i]) &&
+ msk->pm.subflows < subflows_max) {
+ msk->pm.subflows++;
+ i++;
+ }
+ }
+ }
+
+ return i;
+}
+
+static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ bool prio, bool backup)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ pr_debug("send ack for %s",
+ prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
+
+ slow = lock_sock_fast(ssk);
+ if (prio) {
+ subflow->send_mp_prio = 1;
+ subflow->backup = backup;
+ subflow->request_bkup = backup;
+ }
+
+ __mptcp_subflow_send_ack(ssk);
+ unlock_sock_fast(ssk, slow);
+}
+
+static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ bool prio, bool backup)
+{
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_pm_send_ack(msk, subflow, prio, backup);
+ spin_lock_bh(&msk->pm.lock);
+}
+
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (entry->addr.id == id)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct mptcp_pm_addr_entry *
+__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info,
+ bool lookup_by_id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if ((!lookup_by_id &&
+ mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) ||
+ (lookup_by_id && entry->addr.id == info->id))
+ return entry;
+ }
+ return NULL;
+}
+
+static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *local;
+ unsigned int add_addr_signal_max;
+ unsigned int local_addr_max;
+ struct pm_nl_pernet *pernet;
+ unsigned int subflows_max;
+
+ pernet = pm_nl_get_pernet(sock_net(sk));
+
+ add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
+ local_addr_max = mptcp_pm_get_local_addr_max(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
+ /* do lazy endpoint usage accounting for the MPC subflows */
+ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first);
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_addr_info mpc_addr;
+ bool backup = false;
+
+ mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, &mpc_addr, false);
+ if (entry) {
+ __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
+ msk->mpc_endpoint_id = entry->addr.id;
+ backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ }
+ rcu_read_unlock();
+
+ if (backup)
+ mptcp_pm_send_ack(msk, subflow, true, backup);
+
+ msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
+ }
+
+ pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
+ msk->pm.local_addr_used, local_addr_max,
+ msk->pm.add_addr_signaled, add_addr_signal_max,
+ msk->pm.subflows, subflows_max);
+
+ /* check first for announce */
+ if (msk->pm.add_addr_signaled < add_addr_signal_max) {
+ local = select_signal_address(pernet, msk);
+
+ /* due to racing events on both ends we can reach here while
+ * previous add address is still running: if we invoke now
+ * mptcp_pm_announce_addr(), that will fail and the
+ * corresponding id will be marked as used.
+ * Instead let the PM machinery reschedule us when the
+ * current address announce will be completed.
+ */
+ if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
+ return;
+
+ if (local) {
+ if (mptcp_pm_alloc_anno_list(msk, &local->addr)) {
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &local->addr, false);
+ mptcp_pm_nl_addr_send_ack(msk);
+ }
+ }
+ }
+
+ /* check if should create a new subflow */
+ while (msk->pm.local_addr_used < local_addr_max &&
+ msk->pm.subflows < subflows_max) {
+ struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+ bool fullmesh;
+ int i, nr;
+
+ local = select_local_address(pernet, msk);
+ if (!local)
+ break;
+
+ fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH);
+
+ msk->pm.local_addr_used++;
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+ nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs);
+ if (nr == 0)
+ continue;
+
+ spin_unlock_bh(&msk->pm.lock);
+ for (i = 0; i < nr; i++)
+ __mptcp_subflow_connect(sk, &local->addr, &addrs[i]);
+ spin_lock_bh(&msk->pm.lock);
+ }
+ mptcp_pm_nl_check_work_pending(msk);
+}
+
+static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+/* Fill all the local addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
+ struct mptcp_addr_info *remote,
+ struct mptcp_addr_info *addrs)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ unsigned int subflows_max;
+ int i = 0;
+
+ pernet = pm_nl_get_pernet_from_msk(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
+ continue;
+
+ if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
+ continue;
+
+ if (msk->pm.subflows < subflows_max) {
+ msk->pm.subflows++;
+ addrs[i++] = entry->addr;
+ }
+ }
+ rcu_read_unlock();
+
+ /* If the array is empty, fill in the single
+ * 'IPADDRANY' local address
+ */
+ if (!i) {
+ struct mptcp_addr_info local;
+
+ memset(&local, 0, sizeof(local));
+ local.family =
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ remote->family == AF_INET6 &&
+ ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
+#endif
+ remote->family;
+
+ if (!mptcp_pm_addr_families_match(sk, &local, remote))
+ return 0;
+
+ msk->pm.subflows++;
+ addrs[i++] = local;
+ }
+
+ return i;
+}
+
+static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+{
+ struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+ struct sock *sk = (struct sock *)msk;
+ unsigned int add_addr_accept_max;
+ struct mptcp_addr_info remote;
+ unsigned int subflows_max;
+ int i, nr;
+
+ add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
+ pr_debug("accepted %d:%d remote family %d",
+ msk->pm.add_addr_accepted, add_addr_accept_max,
+ msk->pm.remote.family);
+
+ remote = msk->pm.remote;
+ mptcp_pm_announce_addr(msk, &remote, true);
+ mptcp_pm_nl_addr_send_ack(msk);
+
+ if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
+ return;
+
+ /* pick id 0 port, if none is provided the remote address */
+ if (!remote.port)
+ remote.port = sk->sk_dport;
+
+ /* connect to the specified remote address, using whatever
+ * local address the routing configuration will pick.
+ */
+ nr = fill_local_addresses_vec(msk, &remote, addrs);
+ if (nr == 0)
+ return;
+
+ msk->pm.add_addr_accepted++;
+ if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
+ msk->pm.subflows >= subflows_max)
+ WRITE_ONCE(msk->pm.accept_addr, false);
+
+ spin_unlock_bh(&msk->pm.lock);
+ for (i = 0; i < nr; i++)
+ __mptcp_subflow_connect(sk, &addrs[i], &remote);
+ spin_lock_bh(&msk->pm.lock);
+}
+
+void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ msk_owned_by_me(msk);
+ lockdep_assert_held(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal(msk) &&
+ !mptcp_pm_should_rm_signal(msk))
+ return;
+
+ subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
+ if (subflow)
+ mptcp_pm_send_ack(msk, subflow, false, false);
+}
+
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup)
+{
+ struct mptcp_subflow_context *subflow;
+
+ pr_debug("bkup=%d", bkup);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct mptcp_addr_info local, remote;
+
+ mptcp_local_address((struct sock_common *)ssk, &local);
+ if (!mptcp_addresses_equal(&local, addr, addr->port))
+ continue;
+
+ if (rem && rem->family != AF_UNSPEC) {
+ remote_address((struct sock_common *)ssk, &remote);
+ if (!mptcp_addresses_equal(&remote, rem, rem->port))
+ continue;
+ }
+
+ __mptcp_pm_send_ack(msk, subflow, true, bkup);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id)
+{
+ return local_id == id || (!local_id && msk->mpc_endpoint_id == id);
+}
+
+static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list,
+ enum linux_mptcp_mib_field rm_type)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ u8 i;
+
+ pr_debug("%s rm_list_nr %d",
+ rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);
+
+ msk_owned_by_me(msk);
+
+ if (sk->sk_state == TCP_LISTEN)
+ return;
+
+ if (!rm_list->nr)
+ return;
+
+ if (list_empty(&msk->conn_list))
+ return;
+
+ for (i = 0; i < rm_list->nr; i++) {
+ u8 rm_id = rm_list->ids[i];
+ bool removed = false;
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
+ u8 id = subflow->local_id;
+
+ if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id)
+ continue;
+ if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id))
+ continue;
+
+ pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u",
+ rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
+ i, rm_id, subflow->local_id, subflow->remote_id,
+ msk->mpc_endpoint_id);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, how);
+
+ /* the following takes care of updating the subflows counter */
+ mptcp_close_ssk(sk, ssk, subflow);
+ spin_lock_bh(&msk->pm.lock);
+
+ removed = true;
+ __MPTCP_INC_STATS(sock_net(sk), rm_type);
+ }
+ if (rm_type == MPTCP_MIB_RMSUBFLOW)
+ __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap);
+ if (!removed)
+ continue;
+
+ if (!mptcp_pm_is_kernel(msk))
+ continue;
+
+ if (rm_type == MPTCP_MIB_RMADDR) {
+ msk->pm.add_addr_accepted--;
+ WRITE_ONCE(msk->pm.accept_addr, true);
+ } else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
+ msk->pm.local_addr_used--;
+ }
+ }
+}
+
+static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
+{
+ mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
+}
+
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list)
+{
+ mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
+}
+
+void mptcp_pm_nl_work(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ msk_owned_by_me(msk);
+
+ if (!(pm->status & MPTCP_PM_WORK_MASK))
+ return;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ pr_debug("msk=%p status=%x", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
+ mptcp_pm_nl_addr_send_ack(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
+ mptcp_pm_nl_rm_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+static bool address_use_port(struct mptcp_pm_addr_entry *entry)
+{
+ return (entry->flags &
+ (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
+ MPTCP_PM_ADDR_FLAG_SIGNAL;
+}
+
+/* caller must ensure the RCU grace period is already elapsed */
+static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
+{
+ if (entry->lsk)
+ sock_release(entry->lsk);
+ kfree(entry);
+}
+
+static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
+ unsigned int addr_max;
+ int ret = -EINVAL;
+
+ spin_lock_bh(&pernet->lock);
+ /* to keep the code simple, don't do IDR-like allocation for address ID,
+ * just bail when we exceed limits
+ */
+ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
+ pernet->next_id = 1;
+ if (pernet->addrs >= MPTCP_PM_ADDR_MAX) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (test_bit(entry->addr.id, pernet->id_bitmap)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* do not insert duplicate address, differentiate on port only
+ * singled addresses
+ */
+ if (!address_use_port(entry))
+ entry->addr.port = 0;
+ list_for_each_entry(cur, &pernet->local_addr_list, list) {
+ if (mptcp_addresses_equal(&cur->addr, &entry->addr,
+ cur->addr.port || entry->addr.port)) {
+ /* allow replacing the exiting endpoint only if such
+ * endpoint is an implicit one and the user-space
+ * did not provide an endpoint id
+ */
+ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ if (entry->addr.id)
+ goto out;
+
+ pernet->addrs--;
+ entry->addr.id = cur->addr.id;
+ list_del_rcu(&cur->list);
+ del_entry = cur;
+ break;
+ }
+ }
+
+ if (!entry->addr.id) {
+find_next:
+ entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1,
+ pernet->next_id);
+ if (!entry->addr.id && pernet->next_id != 1) {
+ pernet->next_id = 1;
+ goto find_next;
+ }
+ }
+
+ if (!entry->addr.id)
+ goto out;
+
+ __set_bit(entry->addr.id, pernet->id_bitmap);
+ if (entry->addr.id > pernet->next_id)
+ pernet->next_id = entry->addr.id;
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->add_addr_signal_max;
+ WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->local_addr_max;
+ WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
+ }
+
+ pernet->addrs++;
+ if (!entry->addr.port)
+ list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ else
+ list_add_rcu(&entry->list, &pernet->local_addr_list);
+ ret = entry->addr.id;
+
+out:
+ spin_unlock_bh(&pernet->lock);
+
+ /* just replaced an existing entry, free it */
+ if (del_entry) {
+ synchronize_rcu();
+ __mptcp_pm_release_addr_entry(del_entry);
+ }
+ return ret;
+}
+
+static struct lock_class_key mptcp_slock_keys[2];
+static struct lock_class_key mptcp_keys[2];
+
+static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
+ int addrlen = sizeof(struct sockaddr_in);
+ struct sockaddr_storage addr;
+ struct sock *newsk, *ssk;
+ int backlog = 1024;
+ int err;
+
+ err = sock_create_kern(sock_net(sk), entry->addr.family,
+ SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
+ if (err)
+ return err;
+
+ newsk = entry->lsk->sk;
+ if (!newsk)
+ return -EINVAL;
+
+ /* The subflow socket lock is acquired in a nested to the msk one
+ * in several places, even by the TCP stack, and this msk is a kernel
+ * socket: lockdep complains. Instead of propagating the _nested
+ * modifiers in several places, re-init the lock class for the msk
+ * socket to an mptcp specific one.
+ */
+ sock_lock_init_class_and_name(newsk,
+ is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET",
+ &mptcp_slock_keys[is_ipv6],
+ is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET",
+ &mptcp_keys[is_ipv6]);
+
+ lock_sock(newsk);
+ ssk = __mptcp_nmpc_sk(mptcp_sk(newsk));
+ release_sock(newsk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
+
+ mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (entry->addr.family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ if (ssk->sk_family == AF_INET)
+ err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ssk->sk_family == AF_INET6)
+ err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen);
+#endif
+ if (err)
+ return err;
+
+ inet_sk_state_store(newsk, TCP_LISTEN);
+ lock_sock(ssk);
+ err = __inet_listen_sk(ssk, backlog);
+ if (!err)
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
+ release_sock(ssk);
+ return err;
+}
+
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
+{
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int ret = -1;
+
+ pernet = pm_nl_get_pernet_from_msk(msk);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) {
+ ret = entry->addr.id;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret >= 0)
+ return ret;
+
+ /* address not found, add to local list */
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->addr = *skc;
+ entry->addr.id = 0;
+ entry->addr.port = 0;
+ entry->ifindex = 0;
+ entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
+ entry->lsk = NULL;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0)
+ kfree(entry);
+
+ return ret;
+}
+
+#define MPTCP_PM_CMD_GRP_OFFSET 0
+#define MPTCP_PM_EV_GRP_OFFSET 1
+
+static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
+ [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
+ [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+};
+
+static const struct nla_policy
+mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
+ [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
+ [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
+ [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
+ [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
+};
+
+static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
+ [MPTCP_PM_ATTR_ADDR] =
+ NLA_POLICY_NESTED(mptcp_pm_addr_policy),
+ [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ATTR_ADDR_REMOTE] =
+ NLA_POLICY_NESTED(mptcp_pm_addr_policy),
+};
+
+void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ unsigned int active_max_loss_cnt;
+ struct net *net = sock_net(sk);
+ unsigned int stale_loss_cnt;
+ bool slow;
+
+ stale_loss_cnt = mptcp_stale_loss_cnt(net);
+ if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
+ return;
+
+ /* look for another available subflow not in loss state */
+ active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
+ mptcp_for_each_subflow(msk, iter) {
+ if (iter != subflow && mptcp_subflow_active(iter) &&
+ iter->stale_count < active_max_loss_cnt) {
+ /* we have some alternatives, try to mark this subflow as idle ...*/
+ slow = lock_sock_fast(ssk);
+ if (!tcp_rtx_and_write_queues_empty(ssk)) {
+ subflow->stale = 1;
+ __mptcp_retransmit_pending_data(sk);
+ MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE);
+ }
+ unlock_sock_fast(ssk, slow);
+
+ /* always try to push the pending data regardless of re-injections:
+ * we can possibly use backup subflows now, and subflow selection
+ * is cheap under the msk socket lock
+ */
+ __mptcp_push_pending(sk, 0);
+ return;
+ }
+ }
+}
+
+static int mptcp_pm_family_to_addr(int family)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (family == AF_INET6)
+ return MPTCP_PM_ADDR_ATTR_ADDR6;
+#endif
+ return MPTCP_PM_ADDR_ATTR_ADDR4;
+}
+
+static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
+ const struct nlattr *attr,
+ struct genl_info *info,
+ struct mptcp_addr_info *addr,
+ bool require_family)
+{
+ int err, addr_addr;
+
+ if (!attr) {
+ GENL_SET_ERR_MSG(info, "missing address info");
+ return -EINVAL;
+ }
+
+ /* no validation needed - was already done via nested policy */
+ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
+ mptcp_pm_addr_policy, info->extack);
+ if (err)
+ return err;
+
+ if (tb[MPTCP_PM_ADDR_ATTR_ID])
+ addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
+
+ if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
+ if (!require_family)
+ return 0;
+
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing family");
+ return -EINVAL;
+ }
+
+ addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
+ if (addr->family != AF_INET
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ && addr->family != AF_INET6
+#endif
+ ) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "unknown address family");
+ return -EINVAL;
+ }
+ addr_addr = mptcp_pm_family_to_addr(addr->family);
+ if (!tb[addr_addr]) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing address data");
+ return -EINVAL;
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (addr->family == AF_INET6)
+ addr->addr6 = nla_get_in6_addr(tb[addr_addr]);
+ else
+#endif
+ addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
+ addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
+
+ return 0;
+}
+
+int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ struct mptcp_addr_info *addr)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+
+ memset(addr, 0, sizeof(*addr));
+
+ return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true);
+}
+
+int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+ int err;
+
+ memset(entry, 0, sizeof(*entry));
+
+ err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family);
+ if (err)
+ return err;
+
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
+ u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ entry->ifindex = val;
+ }
+
+ if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
+ entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
+ entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
+
+ return 0;
+}
+
+static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
+{
+ return pm_nl_get_pernet(genl_info_net(info));
+}
+
+static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
+{
+ struct mptcp_sock *msk;
+ long s_slot = 0, s_num = 0;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (!READ_ONCE(msk->fully_established) ||
+ mptcp_pm_is_userspace(msk))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_entry(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ GENL_SET_ERR_MSG(info, "flags must have signal when using port");
+ return -EINVAL;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL &&
+ addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+ GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh");
+ return -EINVAL;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) {
+ GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint");
+ return -EINVAL;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "can't allocate addr");
+ return -ENOMEM;
+ }
+
+ *entry = addr;
+ if (entry->addr.port) {
+ ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
+ if (ret) {
+ GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret);
+ goto out_free;
+ }
+ }
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret);
+ goto out_free;
+ }
+
+ mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
+ return 0;
+
+out_free:
+ __mptcp_pm_release_addr_entry(entry);
+ return ret;
+}
+
+int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
+ u8 *flags, int *ifindex)
+{
+ struct mptcp_pm_addr_entry *entry;
+ struct sock *sk = (struct sock *)msk;
+ struct net *net = sock_net(sk);
+
+ rcu_read_lock();
+ entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id);
+ if (entry) {
+ *flags = entry->flags;
+ *ifindex = entry->ifindex;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_add_entry *entry;
+
+ entry = mptcp_pm_del_add_timer(msk, addr, false);
+ if (entry) {
+ list_del(&entry->list);
+ kfree(entry);
+ return true;
+ }
+
+ return false;
+}
+
+static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr,
+ bool force)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+ bool ret;
+
+ list.ids[list.nr++] = addr->id;
+
+ ret = remove_anno_list_by_saddr(msk, addr);
+ if (ret || force) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+ return ret;
+}
+
+static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
+ const struct mptcp_pm_addr_entry *entry)
+{
+ const struct mptcp_addr_info *addr = &entry->addr;
+ struct mptcp_rm_list list = { .nr = 0 };
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ pr_debug("remove_id=%d", addr->id);
+
+ list.ids[list.nr++] = addr->id;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ bool remove_subflow;
+
+ if (mptcp_pm_is_userspace(msk))
+ goto next;
+
+ if (list_empty(&msk->conn_list)) {
+ mptcp_pm_remove_anno_addr(msk, addr, false);
+ goto next;
+ }
+
+ lock_sock(sk);
+ remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
+ mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
+ !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
+ if (remove_subflow)
+ mptcp_pm_remove_subflow(msk, &list);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int mptcp_nl_remove_id_zero_address(struct net *net,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ list.ids[list.nr++] = 0;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info msk_local;
+
+ if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
+ goto next;
+
+ mptcp_local_address((struct sock_common *)msk, &msk_local);
+ if (!mptcp_addresses_equal(&msk_local, addr, addr->port))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &list);
+ mptcp_pm_nl_rm_subflow_received(msk, &list);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ unsigned int addr_max;
+ int ret;
+
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ /* the zero id address is special: the first address used by the msk
+ * always gets such an id, so different subflows can have different zero
+ * id addresses. Additionally zero id is not accounted for in id_bitmap.
+ * Let's use an 'mptcp_rm_list' instead of the common remove code.
+ */
+ if (addr.addr.id == 0)
+ return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr);
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->add_addr_signal_max;
+ WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->local_addr_max;
+ WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
+ }
+
+ pernet->addrs--;
+ list_del_rcu(&entry->list);
+ __clear_bit(entry->addr.id, pernet->id_bitmap);
+ spin_unlock_bh(&pernet->lock);
+
+ mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry);
+ synchronize_rcu();
+ __mptcp_pm_release_addr_entry(entry);
+
+ return ret;
+}
+
+void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
+{
+ struct mptcp_rm_list alist = { .nr = 0 };
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, rm_list, list) {
+ if ((remove_anno_list_by_saddr(msk, &entry->addr) ||
+ lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) &&
+ alist.nr < MPTCP_RM_IDS_MAX)
+ alist.ids[alist.nr++] = entry->addr.id;
+ }
+
+ if (alist.nr) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &alist);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+}
+
+void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list)
+{
+ struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, rm_list, list) {
+ if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
+ slist.nr < MPTCP_RM_IDS_MAX)
+ slist.ids[slist.nr++] = entry->addr.id;
+
+ if (remove_anno_list_by_saddr(msk, &entry->addr) &&
+ alist.nr < MPTCP_RM_IDS_MAX)
+ alist.ids[alist.nr++] = entry->addr.id;
+ }
+
+ if (alist.nr) {
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_remove_addr(msk, &alist);
+ spin_unlock_bh(&msk->pm.lock);
+ }
+ if (slist.nr)
+ mptcp_pm_remove_subflow(msk, &slist);
+}
+
+static void mptcp_nl_remove_addrs_list(struct net *net,
+ struct list_head *rm_list)
+{
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+
+ if (list_empty(rm_list))
+ return;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ lock_sock(sk);
+ mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
+ release_sock(sk);
+ }
+
+ sock_put(sk);
+ cond_resched();
+ }
+}
+
+/* caller must ensure the RCU grace period is already elapsed */
+static void __flush_addrs(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct mptcp_pm_addr_entry *cur;
+
+ cur = list_entry(list->next,
+ struct mptcp_pm_addr_entry, list);
+ list_del_rcu(&cur->list);
+ __mptcp_pm_release_addr_entry(cur);
+ }
+}
+
+static void __reset_counters(struct pm_nl_pernet *pernet)
+{
+ WRITE_ONCE(pernet->add_addr_signal_max, 0);
+ WRITE_ONCE(pernet->add_addr_accept_max, 0);
+ WRITE_ONCE(pernet->local_addr_max, 0);
+ pernet->addrs = 0;
+}
+
+static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ LIST_HEAD(free_list);
+
+ spin_lock_bh(&pernet->lock);
+ list_splice_init(&pernet->local_addr_list, &free_list);
+ __reset_counters(pernet);
+ pernet->next_id = 1;
+ bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ spin_unlock_bh(&pernet->lock);
+ mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
+ synchronize_rcu();
+ __flush_addrs(&free_list);
+ return 0;
+}
+
+static int mptcp_nl_fill_addr(struct sk_buff *skb,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_addr_info *addr = &entry->addr;
+ struct nlattr *attr;
+
+ attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port)))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ goto nla_put_failure;
+ if (entry->ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ goto nla_put_failure;
+
+ if (addr->family == AF_INET &&
+ nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
+ addr->addr.s_addr))
+ goto nla_put_failure;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6 &&
+ nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6))
+ goto nla_put_failure;
+#endif
+ nla_nest_end(skb, attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, attr);
+ return -EMSGSIZE;
+}
+
+static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct sk_buff *msg;
+ void *reply;
+ int ret;
+
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply) {
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto unlock_fail;
+ }
+
+ ret = mptcp_nl_fill_addr(msg, entry);
+ if (ret)
+ goto unlock_fail;
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+
+unlock_fail:
+ spin_unlock_bh(&pernet->lock);
+
+fail:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(msg->sk);
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int id = cb->args[0];
+ void *hdr;
+ int i;
+
+ pernet = pm_nl_get_pernet(net);
+
+ spin_lock_bh(&pernet->lock);
+ for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
+ if (test_bit(i, pernet->id_bitmap)) {
+ entry = __lookup_addr_by_id(pernet, i);
+ if (!entry)
+ break;
+
+ if (entry->addr.id <= id)
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
+
+ id = entry->addr.id;
+ genlmsg_end(msg, hdr);
+ }
+ }
+ spin_unlock_bh(&pernet->lock);
+
+ cb->args[0] = id;
+ return msg->len;
+}
+
+static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
+{
+ struct nlattr *attr = info->attrs[id];
+
+ if (!attr)
+ return 0;
+
+ *limit = nla_get_u32(attr);
+ if (*limit > MPTCP_PM_ADDR_MAX) {
+ GENL_SET_ERR_MSG(info, "limit greater than maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ unsigned int rcv_addrs, subflows;
+ int ret;
+
+ spin_lock_bh(&pernet->lock);
+ rcv_addrs = pernet->add_addr_accept_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
+ if (ret)
+ goto unlock;
+
+ subflows = pernet->subflows_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
+ if (ret)
+ goto unlock;
+
+ WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
+ WRITE_ONCE(pernet->subflows_max, subflows);
+
+unlock:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+static int
+mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct sk_buff *msg;
+ void *reply;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ MPTCP_PM_CMD_GET_LIMITS);
+ if (!reply)
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
+ READ_ONCE(pernet->add_addr_accept_max)))
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
+ READ_ONCE(pernet->subflows_max)))
+ goto fail;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+fail:
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+
+ list.ids[list.nr++] = addr->id;
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_nl_rm_subflow_received(msk, &list);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+static int mptcp_nl_set_flags(struct net *net,
+ struct mptcp_addr_info *addr,
+ u8 bkup, u8 changed)
+{
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
+ goto next;
+
+ lock_sock(sk);
+ if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup);
+ if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
+ mptcp_pm_nl_fullmesh(msk, addr);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return ret;
+}
+
+int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
+ u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
+ MPTCP_PM_ADDR_FLAG_FULLMESH;
+ struct mptcp_pm_addr_entry *entry;
+ u8 lookup_by_id = 0;
+
+ if (addr->addr.family == AF_UNSPEC) {
+ lookup_by_id = 1;
+ if (!addr->addr.id)
+ return -EOPNOTSUPP;
+ }
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr(pernet, &addr->addr, lookup_by_id);
+ if (!entry) {
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
+ }
+ if ((addr->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
+ (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
+ }
+
+ changed = (addr->flags ^ entry->flags) & mask;
+ entry->flags = (entry->flags & ~mask) | (addr->flags & mask);
+ *addr = *entry;
+ spin_unlock_bh(&pernet->lock);
+
+ mptcp_nl_set_flags(net, &addr->addr, bkup, changed);
+ return 0;
+}
+
+static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
+{
+ struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, };
+ struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, };
+ struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct net *net = sock_net(skb->sk);
+ u8 bkup = 0;
+ int ret;
+
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ if (attr_rem) {
+ ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
+ bkup = 1;
+
+ return mptcp_pm_set_flags(net, token, &addr, &remote, bkup);
+}
+
+static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp)
+{
+ genlmsg_multicast_netns(&mptcp_genl_family, net,
+ nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp);
+}
+
+bool mptcp_userspace_pm_active(const struct mptcp_sock *msk)
+{
+ return genl_has_listeners(&mptcp_genl_family,
+ sock_net((const struct sock *)msk),
+ MPTCP_PM_EV_GRP_OFFSET);
+}
+
+static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk)
+{
+ const struct inet_sock *issk = inet_sk(ssk);
+ const struct mptcp_subflow_context *sf;
+
+ if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family))
+ return -EMSGSIZE;
+
+ switch (ssk->sk_family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr))
+ return -EMSGSIZE;
+ if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr))
+ return -EMSGSIZE;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6: {
+ const struct ipv6_pinfo *np = inet6_sk(ssk);
+
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
+ return -EMSGSIZE;
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr))
+ return -EMSGSIZE;
+ break;
+ }
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ return -EMSGSIZE;
+ }
+
+ if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport))
+ return -EMSGSIZE;
+ if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport))
+ return -EMSGSIZE;
+
+ sf = mptcp_subflow_ctx(ssk);
+ if (WARN_ON_ONCE(!sf))
+ return -EINVAL;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int mptcp_event_put_token_and_ssk(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ const struct sock *sk = (const struct sock *)msk;
+ const struct mptcp_subflow_context *sf;
+ u8 sk_err;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ return -EMSGSIZE;
+
+ if (mptcp_event_add_subflow(skb, ssk))
+ return -EMSGSIZE;
+
+ sf = mptcp_subflow_ctx(ssk);
+ if (WARN_ON_ONCE(!sf))
+ return -EINVAL;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup))
+ return -EMSGSIZE;
+
+ if (ssk->sk_bound_dev_if &&
+ nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if))
+ return -EMSGSIZE;
+
+ sk_err = READ_ONCE(ssk->sk_err);
+ if (sk_err && sk->sk_state == TCP_ESTABLISHED &&
+ nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int mptcp_event_sub_established(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ return mptcp_event_put_token_and_ssk(skb, msk, ssk);
+}
+
+static int mptcp_event_sub_closed(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ const struct mptcp_subflow_context *sf;
+
+ if (mptcp_event_put_token_and_ssk(skb, msk, ssk))
+ return -EMSGSIZE;
+
+ sf = mptcp_subflow_ctx(ssk);
+ if (!sf->reset_seen)
+ return 0;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int mptcp_event_created(struct sk_buff *skb,
+ const struct mptcp_sock *msk,
+ const struct sock *ssk)
+{
+ int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token);
+
+ if (err)
+ return err;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side)))
+ return -EMSGSIZE;
+
+ return mptcp_event_add_subflow(skb, ssk);
+}
+
+void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id)
+{
+ struct net *net = sock_net((const struct sock *)msk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED);
+ if (!nlh)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id))
+ goto nla_put_failure;
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+}
+
+void mptcp_event_addr_announced(const struct sock *ssk,
+ const struct mptcp_addr_info *info)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct net *net = sock_net(ssk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0,
+ MPTCP_EVENT_ANNOUNCED);
+ if (!nlh)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, MPTCP_ATTR_DPORT,
+ info->port == 0 ?
+ inet_sk(ssk)->inet_dport :
+ info->port))
+ goto nla_put_failure;
+
+ switch (info->family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr))
+ goto nla_put_failure;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6:
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6))
+ goto nla_put_failure;
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+}
+
+void mptcp_event_pm_listener(const struct sock *ssk,
+ enum mptcp_event_type event)
+{
+ const struct inet_sock *issk = inet_sk(ssk);
+ struct net *net = sock_net(ssk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event);
+ if (!nlh)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport))
+ goto nla_put_failure;
+
+ switch (ssk->sk_family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr))
+ goto nla_put_failure;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6: {
+ const struct ipv6_pinfo *np = inet6_sk(ssk);
+
+ if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr))
+ goto nla_put_failure;
+ break;
+ }
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, GFP_KERNEL);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+}
+
+void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
+ const struct sock *ssk, gfp_t gfp)
+{
+ struct net *net = sock_net((const struct sock *)msk);
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+
+ if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET))
+ return;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!skb)
+ return;
+
+ nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type);
+ if (!nlh)
+ goto nla_put_failure;
+
+ switch (type) {
+ case MPTCP_EVENT_UNSPEC:
+ WARN_ON_ONCE(1);
+ break;
+ case MPTCP_EVENT_CREATED:
+ case MPTCP_EVENT_ESTABLISHED:
+ if (mptcp_event_created(skb, msk, ssk) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_CLOSED:
+ if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_ANNOUNCED:
+ case MPTCP_EVENT_REMOVED:
+ /* call mptcp_event_addr_announced()/removed instead */
+ WARN_ON_ONCE(1);
+ break;
+ case MPTCP_EVENT_SUB_ESTABLISHED:
+ case MPTCP_EVENT_SUB_PRIORITY:
+ if (mptcp_event_sub_established(skb, msk, ssk) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_SUB_CLOSED:
+ if (mptcp_event_sub_closed(skb, msk, ssk) < 0)
+ goto nla_put_failure;
+ break;
+ case MPTCP_EVENT_LISTENER_CREATED:
+ case MPTCP_EVENT_LISTENER_CLOSED:
+ break;
+ }
+
+ genlmsg_end(skb, nlh);
+ mptcp_nl_mcast_send(net, skb, gfp);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+}
+
+static const struct genl_small_ops mptcp_pm_ops[] = {
+ {
+ .cmd = MPTCP_PM_CMD_ADD_ADDR,
+ .doit = mptcp_nl_cmd_add_addr,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_DEL_ADDR,
+ .doit = mptcp_nl_cmd_del_addr,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
+ .doit = mptcp_nl_cmd_flush_addrs,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_ADDR,
+ .doit = mptcp_nl_cmd_get_addr,
+ .dumpit = mptcp_nl_cmd_dump_addrs,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_LIMITS,
+ .doit = mptcp_nl_cmd_set_limits,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_LIMITS,
+ .doit = mptcp_nl_cmd_get_limits,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_FLAGS,
+ .doit = mptcp_nl_cmd_set_flags,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_ANNOUNCE,
+ .doit = mptcp_nl_cmd_announce,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_REMOVE,
+ .doit = mptcp_nl_cmd_remove,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE,
+ .doit = mptcp_nl_cmd_sf_create,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY,
+ .doit = mptcp_nl_cmd_sf_destroy,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+};
+
+static struct genl_family mptcp_genl_family __ro_after_init = {
+ .name = MPTCP_PM_NAME,
+ .version = MPTCP_PM_VER,
+ .maxattr = MPTCP_PM_ATTR_MAX,
+ .policy = mptcp_pm_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .small_ops = mptcp_pm_ops,
+ .n_small_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
+ .mcgrps = mptcp_pm_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
+};
+
+static int __net_init pm_nl_init_net(struct net *net)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
+
+ INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
+
+ /* Cit. 2 subflows ought to be enough for anybody. */
+ pernet->subflows_max = 2;
+ pernet->next_id = 1;
+ pernet->stale_loss_cnt = 4;
+ spin_lock_init(&pernet->lock);
+
+ /* No need to initialize other pernet fields, the struct is zeroed at
+ * allocation time.
+ */
+
+ return 0;
+}
+
+static void __net_exit pm_nl_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_list, exit_list) {
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
+
+ /* net is removed from namespace list, can't race with
+ * other modifiers, also netns core already waited for a
+ * RCU grace period.
+ */
+ __flush_addrs(&pernet->local_addr_list);
+ }
+}
+
+static struct pernet_operations mptcp_pm_pernet_ops = {
+ .init = pm_nl_init_net,
+ .exit_batch = pm_nl_exit_net,
+ .id = &pm_nl_pernet_id,
+ .size = sizeof(struct pm_nl_pernet),
+};
+
+void __init mptcp_pm_nl_init(void)
+{
+ if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
+ panic("Failed to register MPTCP PM pernet subsystem.\n");
+
+ if (genl_register_family(&mptcp_genl_family))
+ panic("Failed to register MPTCP PM netlink family\n");
+}
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
new file mode 100644
index 000000000..d042d32be
--- /dev/null
+++ b/net/mptcp/pm_userspace.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, Intel Corporation.
+ */
+
+#include "protocol.h"
+#include "mib.h"
+
+void mptcp_free_local_addr_list(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_addr_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ LIST_HEAD(free_list);
+
+ if (!mptcp_pm_is_userspace(msk))
+ return;
+
+ spin_lock_bh(&msk->pm.lock);
+ list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+ sock_kfree_s(sk, entry, sizeof(*entry));
+ }
+}
+
+static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct mptcp_pm_addr_entry *match = NULL;
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *e;
+ bool addr_match = false;
+ bool id_match = false;
+ int ret = -EINVAL;
+
+ bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
+ addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true);
+ if (addr_match && entry->addr.id == 0)
+ entry->addr.id = e->addr.id;
+ id_match = (e->addr.id == entry->addr.id);
+ if (addr_match && id_match) {
+ match = e;
+ break;
+ } else if (addr_match || id_match) {
+ break;
+ }
+ __set_bit(e->addr.id, id_bitmap);
+ }
+
+ if (!match && !addr_match && !id_match) {
+ /* Memory for the entry is allocated from the
+ * sock option buffer.
+ */
+ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC);
+ if (!e) {
+ ret = -ENOMEM;
+ goto append_err;
+ }
+
+ *e = *entry;
+ if (!e->addr.id)
+ e->addr.id = find_next_zero_bit(id_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1,
+ 1);
+ list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list);
+ msk->pm.local_addr_used++;
+ ret = e->addr.id;
+ } else if (match) {
+ ret = entry->addr.id;
+ }
+
+append_err:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+/* If the subflow is closed from the other peer (not via a
+ * subflow destroy command then), we want to keep the entry
+ * not to assign the same ID to another address and to be
+ * able to send RM_ADDR after the removal of the subflow.
+ */
+static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *addr)
+{
+ struct mptcp_pm_addr_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) {
+ /* TODO: a refcount is needed because the entry can
+ * be used multiple times (e.g. fullmesh mode).
+ */
+ list_del_rcu(&entry->list);
+ kfree(entry);
+ msk->pm.local_addr_used--;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
+ unsigned int id,
+ u8 *flags, int *ifindex)
+{
+ struct mptcp_pm_addr_entry *entry, *match = NULL;
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (id == entry->addr.id) {
+ match = entry;
+ break;
+ }
+ }
+ spin_unlock_bh(&msk->pm.lock);
+ if (match) {
+ *flags = match->flags;
+ *ifindex = match->ifindex;
+ }
+
+ return 0;
+}
+
+int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
+ struct mptcp_addr_info *skc)
+{
+ struct mptcp_pm_addr_entry new_entry;
+ __be16 msk_sport = ((struct inet_sock *)
+ inet_sk((struct sock *)msk))->inet_sport;
+
+ memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry));
+ new_entry.addr = *skc;
+ new_entry.addr.id = 0;
+ new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
+
+ if (new_entry.addr.port == msk_sport)
+ new_entry.addr.port = 0;
+
+ return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry);
+}
+
+int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_pm_addr_entry addr_val;
+ struct mptcp_sock *msk;
+ int err = -EINVAL;
+ u32 token_val;
+
+ if (!addr || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto announce_err;
+ }
+
+ err = mptcp_pm_parse_entry(addr, info, true, &addr_val);
+ if (err < 0) {
+ GENL_SET_ERR_MSG(info, "error parsing local address");
+ goto announce_err;
+ }
+
+ if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ GENL_SET_ERR_MSG(info, "invalid addr id or flags");
+ err = -EINVAL;
+ goto announce_err;
+ }
+
+ err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val);
+ if (err < 0) {
+ GENL_SET_ERR_MSG(info, "did not match address and id");
+ goto announce_err;
+ }
+
+ lock_sock((struct sock *)msk);
+ spin_lock_bh(&msk->pm.lock);
+
+ if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) {
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &addr_val.addr, false);
+ mptcp_pm_nl_addr_send_ack(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock((struct sock *)msk);
+
+ err = 0;
+ announce_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID];
+ struct mptcp_pm_addr_entry *match = NULL;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_sock *msk;
+ LIST_HEAD(free_list);
+ int err = -EINVAL;
+ u32 token_val;
+ u8 id_val;
+
+ if (!id || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ id_val = nla_get_u8(id);
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto remove_err;
+ }
+
+ lock_sock((struct sock *)msk);
+
+ list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (entry->addr.id == id_val) {
+ match = entry;
+ break;
+ }
+ }
+
+ if (!match) {
+ GENL_SET_ERR_MSG(info, "address with specified id not found");
+ release_sock((struct sock *)msk);
+ goto remove_err;
+ }
+
+ list_move(&match->list, &free_list);
+
+ mptcp_pm_remove_addrs(msk, &free_list);
+
+ release_sock((struct sock *)msk);
+
+ list_for_each_entry_safe(match, entry, &free_list, list) {
+ sock_kfree_s((struct sock *)msk, match, sizeof(*match));
+ }
+
+ err = 0;
+ remove_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_pm_addr_entry local = { 0 };
+ struct mptcp_addr_info addr_r;
+ struct mptcp_addr_info addr_l;
+ struct mptcp_sock *msk;
+ int err = -EINVAL;
+ struct sock *sk;
+ u32 token_val;
+
+ if (!laddr || !raddr || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(genl_info_net(info), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto create_err;
+ }
+
+ err = mptcp_pm_parse_addr(laddr, info, &addr_l);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ goto create_err;
+ }
+
+ err = mptcp_pm_parse_addr(raddr, info, &addr_r);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr");
+ goto create_err;
+ }
+
+ sk = (struct sock *)msk;
+
+ if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) {
+ GENL_SET_ERR_MSG(info, "families mismatch");
+ err = -EINVAL;
+ goto create_err;
+ }
+
+ local.addr = addr_l;
+ err = mptcp_userspace_pm_append_new_local_addr(msk, &local);
+ if (err < 0) {
+ GENL_SET_ERR_MSG(info, "did not match address and id");
+ goto create_err;
+ }
+
+ lock_sock(sk);
+
+ err = __mptcp_subflow_connect(sk, &addr_l, &addr_r);
+
+ release_sock(sk);
+
+ spin_lock_bh(&msk->pm.lock);
+ if (err)
+ mptcp_userspace_pm_delete_local_addr(msk, &local);
+ else
+ msk->pm.subflows++;
+ spin_unlock_bh(&msk->pm.lock);
+
+ create_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *local,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (local->family != remote->family)
+ return NULL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ const struct inet_sock *issk;
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (local->family != ssk->sk_family)
+ continue;
+
+ issk = inet_sk(ssk);
+
+ switch (ssk->sk_family) {
+ case AF_INET:
+ if (issk->inet_saddr != local->addr.s_addr ||
+ issk->inet_daddr != remote->addr.s_addr)
+ continue;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6: {
+ const struct ipv6_pinfo *pinfo = inet6_sk(ssk);
+
+ if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) ||
+ !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr))
+ continue;
+ break;
+ }
+#endif
+ default:
+ continue;
+ }
+
+ if (issk->inet_sport == local->port &&
+ issk->inet_dport == remote->port)
+ return ssk;
+ }
+
+ return NULL;
+}
+
+int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_addr_info addr_l;
+ struct mptcp_addr_info addr_r;
+ struct mptcp_sock *msk;
+ struct sock *sk, *ssk;
+ int err = -EINVAL;
+ u32 token_val;
+
+ if (!laddr || !raddr || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(genl_info_net(info), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto destroy_err;
+ }
+
+ err = mptcp_pm_parse_addr(laddr, info, &addr_l);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ goto destroy_err;
+ }
+
+ err = mptcp_pm_parse_addr(raddr, info, &addr_r);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr");
+ goto destroy_err;
+ }
+
+ if (addr_l.family != addr_r.family) {
+ GENL_SET_ERR_MSG(info, "address families do not match");
+ err = -EINVAL;
+ goto destroy_err;
+ }
+
+ if (!addr_l.port || !addr_r.port) {
+ GENL_SET_ERR_MSG(info, "missing local or remote port");
+ err = -EINVAL;
+ goto destroy_err;
+ }
+
+ sk = (struct sock *)msk;
+ lock_sock(sk);
+ ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
+ if (ssk) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_pm_addr_entry entry = { .addr = addr_l };
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_userspace_pm_delete_local_addr(msk, &entry);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ mptcp_close_ssk(sk, ssk, subflow);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+ err = 0;
+ } else {
+ err = -ESRCH;
+ }
+ release_sock(sk);
+
+destroy_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup)
+{
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+ u32 token_val;
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(net, token_val);
+ if (!msk)
+ return ret;
+
+ if (!mptcp_pm_is_userspace(msk))
+ goto set_flags_err;
+
+ if (loc->addr.family == AF_UNSPEC ||
+ rem->addr.family == AF_UNSPEC)
+ goto set_flags_err;
+
+ lock_sock((struct sock *)msk);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup);
+ release_sock((struct sock *)msk);
+
+set_flags_err:
+ sock_put((struct sock *)msk);
+ return ret;
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
new file mode 100644
index 000000000..5c003a0f0
--- /dev/null
+++ b/net/mptcp/protocol.c
@@ -0,0 +1,4102 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/sched/signal.h>
+#include <linux/atomic.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/tcp_states.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/transp_v6.h>
+#endif
+#include <net/mptcp.h>
+#include <net/xfrm.h>
+#include <asm/ioctls.h>
+#include "protocol.h"
+#include "mib.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mptcp.h>
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+struct mptcp6_sock {
+ struct mptcp_sock msk;
+ struct ipv6_pinfo np;
+};
+#endif
+
+enum {
+ MPTCP_CMSG_TS = BIT(0),
+ MPTCP_CMSG_INQ = BIT(1),
+};
+
+static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
+
+static void __mptcp_destroy_sock(struct sock *sk);
+static void mptcp_check_send_data_fin(struct sock *sk);
+
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+static struct net_device mptcp_napi_dev;
+
+/* Returns end sequence number of the receiver's advertised window */
+static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->wnd_end);
+}
+
+static bool mptcp_is_tcpsk(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (unlikely(sk->sk_prot == &tcp_prot)) {
+ /* we are being invoked after mptcp_accept() has
+ * accepted a non-mp-capable flow: sk is a tcp_sk,
+ * not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ WRITE_ONCE(sock->ops, &inet_stream_ops);
+ return true;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
+ WRITE_ONCE(sock->ops, &inet6_stream_ops);
+ return true;
+#endif
+ }
+
+ return false;
+}
+
+static int __mptcp_socket_create(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct socket *ssock;
+ int err;
+
+ err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock);
+ if (err)
+ return err;
+
+ msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
+ WRITE_ONCE(msk->first, ssock->sk);
+ subflow = mptcp_subflow_ctx(ssock->sk);
+ list_add(&subflow->node, &msk->conn_list);
+ sock_hold(ssock->sk);
+ subflow->request_mptcp = 1;
+ subflow->subflow_id = msk->subflow_id++;
+
+ /* This is the first subflow, always with id 0 */
+ subflow->local_id_valid = 1;
+ mptcp_sock_graft(msk->first, sk->sk_socket);
+ iput(SOCK_INODE(ssock));
+
+ return 0;
+}
+
+/* If the MPC handshake is not started, returns the first subflow,
+ * eventually allocating it.
+ */
+struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret;
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ return ERR_PTR(-EINVAL);
+
+ if (!msk->first) {
+ ret = __mptcp_socket_create(msk);
+ if (ret)
+ return ERR_PTR(ret);
+
+ mptcp_sockopt_sync(msk, msk->first);
+ }
+
+ return msk->first;
+}
+
+static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
+static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size)
+{
+ WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc,
+ mptcp_sk(sk)->rmem_fwd_alloc + size);
+}
+
+static void mptcp_rmem_charge(struct sock *sk, int size)
+{
+ mptcp_rmem_fwd_alloc_add(sk, -size);
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (MPTCP_SKB_CB(from)->offset ||
+ !skb_try_coalesce(to, from, &fragstolen, &delta))
+ return false;
+
+ pr_debug("colesced seq %llx into %llx new len %d new end seq %llx",
+ MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
+ to->len, MPTCP_SKB_CB(from)->end_seq);
+ MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+
+ /* note the fwd memory can reach a negative value after accounting
+ * for the delta, but the later skb free will restore a non
+ * negative one
+ */
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ mptcp_rmem_charge(sk, delta);
+ kfree_skb_partial(from, fragstolen);
+
+ return true;
+}
+
+static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
+ return false;
+
+ return mptcp_try_coalesce((struct sock *)msk, to, from);
+}
+
+static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= PAGE_SHIFT;
+ mptcp_rmem_charge(sk, amount << PAGE_SHIFT);
+ __sk_mem_reduce_allocated(sk, amount);
+}
+
+static void mptcp_rmem_uncharge(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int reclaimable;
+
+ mptcp_rmem_fwd_alloc_add(sk, size);
+ reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
+
+ /* see sk_mem_uncharge() for the rationale behind the following schema */
+ if (unlikely(reclaimable >= PAGE_SIZE))
+ __mptcp_rmem_reclaim(sk, reclaimable);
+}
+
+static void mptcp_rfree(struct sk_buff *skb)
+{
+ unsigned int len = skb->truesize;
+ struct sock *sk = skb->sk;
+
+ atomic_sub(len, &sk->sk_rmem_alloc);
+ mptcp_rmem_uncharge(sk, len);
+}
+
+void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = mptcp_rfree;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ mptcp_rmem_charge(sk, skb->truesize);
+}
+
+/* "inspired" by tcp_data_queue_ofo(), main differences:
+ * - use mptcp seqs
+ * - don't cope with sacks
+ */
+static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct rb_node **p, *parent;
+ u64 seq, end_seq, max_seq;
+ struct sk_buff *skb1;
+
+ seq = MPTCP_SKB_CB(skb)->map_seq;
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ max_seq = atomic64_read(&msk->rcv_wnd_sent);
+
+ pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+ RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ if (after64(end_seq, max_seq)) {
+ /* out of window */
+ mptcp_drop(sk, skb);
+ pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
+ (unsigned long long)end_seq - (unsigned long)max_seq,
+ (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+ return;
+ }
+
+ p = &msk->out_of_order_queue.rb_node;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
+ if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
+ rb_link_node(&skb->rbnode, NULL, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+ msk->ooo_last_skb = skb;
+ goto end;
+ }
+
+ /* with 2 subflows, adding at end of ooo queue is quite likely
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ return;
+ }
+
+ /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
+ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
+ parent = &msk->ooo_last_skb->rbnode;
+ p = &parent->rb_right;
+ goto insert;
+ }
+
+ /* Find place to insert this segment. Handle overlaps on the way. */
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ p = &parent->rb_left;
+ continue;
+ }
+ if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ return;
+ }
+ if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
+ /* partial overlap:
+ * | skb |
+ * | skb1 |
+ * continue traversing
+ */
+ } else {
+ /* skb's seq == skb1's seq and skb covers skb1.
+ * Replace skb1 with skb.
+ */
+ rb_replace_node(&skb1->rbnode, &skb->rbnode,
+ &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ goto merge_right;
+ }
+ } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
+ return;
+ }
+ p = &parent->rb_right;
+ }
+
+insert:
+ /* Insert segment into RB tree. */
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
+
+merge_right:
+ /* Remove other segments covered by skb. */
+ while ((skb1 = skb_rb_next(skb)) != NULL) {
+ if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
+ break;
+ rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
+ mptcp_drop(sk, skb1);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ }
+ /* If there is no skb after us, we are the last_skb ! */
+ if (!skb1)
+ msk->ooo_last_skb = skb;
+
+end:
+ skb_condense(skb);
+ mptcp_set_owner_r(skb, sk);
+}
+
+static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int amt, amount;
+
+ if (size <= msk->rmem_fwd_alloc)
+ return true;
+
+ size -= msk->rmem_fwd_alloc;
+ amt = sk_mem_pages(size);
+ amount = amt << PAGE_SHIFT;
+ if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV))
+ return false;
+
+ mptcp_rmem_fwd_alloc_add(sk, amount);
+ return true;
+}
+
+static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb, unsigned int offset,
+ size_t copy_len)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
+ bool has_rxtstamp;
+
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_orphan(skb);
+
+ /* try to fetch required memory from subflow */
+ if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
+ goto drop;
+
+ has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+
+ /* the skb map_seq accounts for the skb offset:
+ * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
+ * value
+ */
+ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+ MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
+
+ if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
+ /* in sequence */
+ msk->bytes_received += copy_len;
+ WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (tail && mptcp_try_coalesce(sk, tail, skb))
+ return true;
+
+ mptcp_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ return true;
+ } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
+ mptcp_data_queue_ofo(msk, skb);
+ return false;
+ }
+
+ /* old data, keep it simple and drop the whole pkt, sender
+ * will retransmit as needed, if needed.
+ */
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+drop:
+ mptcp_drop(sk, skb);
+ return false;
+}
+
+static void mptcp_stop_rtx_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
+}
+
+static void mptcp_close_wake_up(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk->sk_state_change(sk);
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+}
+
+static bool mptcp_pending_data_fin_ack(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ return ((1 << sk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
+ msk->write_seq == READ_ONCE(msk->snd_una);
+}
+
+static void mptcp_check_data_fin_ack(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* Look for an acknowledged DATA_FIN */
+ if (mptcp_pending_data_fin_ack(sk)) {
+ WRITE_ONCE(msk->snd_data_fin_enable, 0);
+
+ switch (sk->sk_state) {
+ case TCP_FIN_WAIT1:
+ inet_sk_state_store(sk, TCP_FIN_WAIT2);
+ break;
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ inet_sk_state_store(sk, TCP_CLOSE);
+ break;
+ }
+
+ mptcp_close_wake_up(sk);
+ }
+}
+
+static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (READ_ONCE(msk->rcv_data_fin) &&
+ ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) {
+ u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
+
+ if (msk->ack_seq == rcv_data_fin_seq) {
+ if (seq)
+ *seq = rcv_data_fin_seq;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void mptcp_set_datafin_timeout(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 retransmits;
+
+ retransmits = min_t(u32, icsk->icsk_retransmits,
+ ilog2(TCP_RTO_MAX / TCP_RTO_MIN));
+
+ mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits;
+}
+
+static void __mptcp_set_timeout(struct sock *sk, long tout)
+{
+ mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
+}
+
+static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow)
+{
+ const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+}
+
+static void mptcp_set_timeout(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ long tout = 0;
+
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow)
+ tout = max(tout, mptcp_timeout_from_subflow(subflow));
+ __mptcp_set_timeout(sk, tout);
+}
+
+static inline bool tcp_can_send_ack(const struct sock *ssk)
+{
+ return !((1 << inet_sk_state_load(ssk)) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
+}
+
+void __mptcp_subflow_send_ack(struct sock *ssk)
+{
+ if (tcp_can_send_ack(ssk))
+ tcp_send_ack(ssk);
+}
+
+static void mptcp_subflow_send_ack(struct sock *ssk)
+{
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ __mptcp_subflow_send_ack(ssk);
+ unlock_sock_fast(ssk, slow);
+}
+
+static void mptcp_send_ack(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ mptcp_for_each_subflow(msk, subflow)
+ mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow));
+}
+
+static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
+{
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ if (tcp_can_send_ack(ssk))
+ tcp_cleanup_rbuf(ssk, 1);
+ unlock_sock_fast(ssk, slow);
+}
+
+static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
+{
+ const struct inet_connection_sock *icsk = inet_csk(ssk);
+ u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending);
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ return (ack_pending & ICSK_ACK_SCHED) &&
+ ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) >
+ READ_ONCE(icsk->icsk_ack.rcv_mss)) ||
+ (rx_empty && ack_pending &
+ (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED)));
+}
+
+static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
+{
+ int old_space = READ_ONCE(msk->old_wspace);
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int space = __mptcp_space(sk);
+ bool cleanup, rx_empty;
+
+ cleanup = (space > 0) && (space >= (old_space << 1));
+ rx_empty = !__mptcp_rmem(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty))
+ mptcp_subflow_cleanup_rbuf(ssk);
+ }
+}
+
+static bool mptcp_check_data_fin(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ u64 rcv_data_fin_seq;
+ bool ret = false;
+
+ /* Need to ack a DATA_FIN received from a peer while this side
+ * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
+ * msk->rcv_data_fin was set when parsing the incoming options
+ * at the subflow level and the msk lock was not held, so this
+ * is the first opportunity to act on the DATA_FIN and change
+ * the msk state.
+ *
+ * If we are caught up to the sequence number of the incoming
+ * DATA_FIN, send the DATA_ACK now and do state transition. If
+ * not caught up, do nothing and let the recv code send DATA_ACK
+ * when catching up.
+ */
+
+ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) {
+ WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
+ WRITE_ONCE(msk->rcv_data_fin, 0);
+
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ inet_sk_state_store(sk, TCP_CLOSE_WAIT);
+ break;
+ case TCP_FIN_WAIT1:
+ inet_sk_state_store(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ inet_sk_state_store(sk, TCP_CLOSE);
+ break;
+ default:
+ /* Other states not expected */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ ret = true;
+ if (!__mptcp_check_fallback(msk))
+ mptcp_send_ack(msk);
+ mptcp_close_wake_up(sk);
+ }
+ return ret;
+}
+
+static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
+ struct sock *ssk,
+ unsigned int *bytes)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+ bool more_data_avail;
+ struct tcp_sock *tp;
+ bool done = false;
+ int sk_rbuf;
+
+ sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
+
+ if (unlikely(ssk_rbuf > sk_rbuf)) {
+ WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
+ sk_rbuf = ssk_rbuf;
+ }
+ }
+
+ pr_debug("msk=%p ssk=%p", msk, ssk);
+ tp = tcp_sk(ssk);
+ do {
+ u32 map_remaining, offset;
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ bool fin;
+
+ /* try to move as much data as available */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb) {
+ /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
+ * a different CPU can have already processed the pending
+ * data, stop here or we can enter an infinite loop
+ */
+ if (!moved)
+ done = true;
+ break;
+ }
+
+ if (__mptcp_check_fallback(msk)) {
+ /* Under fallback skbs have no MPTCP extension and TCP could
+ * collapse them between the dummy map creation and the
+ * current dequeue. Be sure to adjust the map size.
+ */
+ map_remaining = skb->len;
+ subflow->map_data_len = skb->len;
+ }
+
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ if (fin) {
+ done = true;
+ seq++;
+ }
+
+ if (offset < skb->len) {
+ size_t len = skb->len - offset;
+
+ if (tp->urg_data)
+ done = true;
+
+ if (__mptcp_move_skb(msk, ssk, skb, offset, len))
+ moved += len;
+ seq += len;
+
+ if (WARN_ON_ONCE(map_remaining < len))
+ break;
+ } else {
+ WARN_ON_ONCE(!fin);
+ sk_eat_skb(ssk, skb);
+ done = true;
+ }
+
+ WRITE_ONCE(tp->copied_seq, seq);
+ more_data_avail = mptcp_subflow_data_available(ssk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
+ done = true;
+ break;
+ }
+ } while (more_data_avail);
+
+ *bytes += moved;
+ return done;
+}
+
+static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb, *tail;
+ bool moved = false;
+ struct rb_node *p;
+ u64 end_seq;
+
+ p = rb_first(&msk->out_of_order_queue);
+ pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+ while (p) {
+ skb = rb_to_skb(p);
+ if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
+ break;
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &msk->out_of_order_queue);
+
+ if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
+ msk->ack_seq))) {
+ mptcp_drop(sk, skb);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
+ continue;
+ }
+
+ end_seq = MPTCP_SKB_CB(skb)->end_seq;
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
+ int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ /* skip overlapping data, if any */
+ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+ MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
+ delta);
+ MPTCP_SKB_CB(skb)->offset += delta;
+ MPTCP_SKB_CB(skb)->map_seq += delta;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+ msk->bytes_received += end_seq - msk->ack_seq;
+ msk->ack_seq = end_seq;
+ moved = true;
+ }
+ return moved;
+}
+
+static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
+{
+ int err = sock_error(ssk);
+ int ssk_state;
+
+ if (!err)
+ return false;
+
+ /* only propagate errors on fallen-back sockets or
+ * on MPC connect
+ */
+ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
+ return false;
+
+ /* We need to propagate only transition to CLOSE state.
+ * Orphaned socket will see such state change via
+ * subflow_sched_work_if_closed() and that path will properly
+ * destroy the msk as needed.
+ */
+ ssk_state = inet_sk_state_load(ssk);
+ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+ inet_sk_state_store(sk, ssk_state);
+ WRITE_ONCE(sk->sk_err, -err);
+
+ /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+ smp_wmb();
+ sk_error_report(sk);
+ return true;
+}
+
+void __mptcp_error_report(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow(msk, subflow)
+ if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
+ break;
+}
+
+/* In most cases we will be able to lock the mptcp socket. If its already
+ * owned, we need to defer to the work queue to avoid ABBA deadlock.
+ */
+static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+
+ __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ __mptcp_ofo_queue(msk);
+ if (unlikely(ssk->sk_err)) {
+ if (!sock_owned_by_user(sk))
+ __mptcp_error_report(sk);
+ else
+ __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
+ }
+
+ /* If the moves have caught up with the DATA_FIN sequence number
+ * it's time to ack the DATA_FIN and change socket state, but
+ * this is not a good place to change state. Let the workqueue
+ * do it.
+ */
+ if (mptcp_pending_data_fin(sk, NULL))
+ mptcp_schedule_work(sk);
+ return moved > 0;
+}
+
+void mptcp_data_ready(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int sk_rbuf, ssk_rbuf;
+
+ /* The peer can send data while we are shutting down this
+ * subflow at msk destruction time, but we must avoid enqueuing
+ * more data to the msk receive queue
+ */
+ if (unlikely(subflow->disposable))
+ return;
+
+ ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
+ sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
+ if (unlikely(ssk_rbuf > sk_rbuf))
+ sk_rbuf = ssk_rbuf;
+
+ /* over limit? can't append more skbs to msk, Also, no need to wake-up*/
+ if (__mptcp_rmem(sk) > sk_rbuf) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
+ return;
+ }
+
+ /* Wake-up the reader only for in-sequence data */
+ mptcp_data_lock(sk);
+ if (move_skbs_to_msk(msk, ssk))
+ sk->sk_data_ready(sk);
+
+ mptcp_data_unlock(sk);
+}
+
+static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
+{
+ mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+ mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
+}
+
+static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return false;
+
+ /* attach to msk socket only after we are sure we will deal with it
+ * at close time
+ */
+ if (sk->sk_socket && !ssk->sk_socket)
+ mptcp_sock_graft(ssk, sk->sk_socket);
+
+ mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
+ mptcp_sockopt_sync_locked(msk, ssk);
+ mptcp_subflow_joined(msk, ssk);
+ mptcp_stop_tout_timer(sk);
+ __mptcp_propagate_sndbuf(sk, ssk);
+ return true;
+}
+
+static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list)
+{
+ struct mptcp_subflow_context *tmp, *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ list_for_each_entry_safe(subflow, tmp, join_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+ list_move_tail(&subflow->node, &msk->conn_list);
+ if (!__mptcp_finish_join(msk, ssk))
+ mptcp_subflow_reset(ssk);
+ unlock_sock_fast(ssk, slow);
+ }
+}
+
+static bool mptcp_rtx_timer_pending(struct sock *sk)
+{
+ return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+}
+
+static void mptcp_reset_rtx_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long tout;
+
+ /* prevent rescheduling on close */
+ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
+ return;
+
+ tout = mptcp_sk(sk)->timer_ival;
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+}
+
+bool mptcp_schedule_work(struct sock *sk)
+{
+ if (inet_sk_state_load(sk) != TCP_CLOSE &&
+ schedule_work(&mptcp_sk(sk)->work)) {
+ /* each subflow already holds a reference to the sk, and the
+ * workqueue is invoked by a subflow, so sk can't go away here.
+ */
+ sock_hold(sk);
+ return true;
+ }
+ return false;
+}
+
+static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ msk_owned_by_me(msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->data_avail))
+ return mptcp_subflow_tcp_sock(subflow);
+ }
+
+ return NULL;
+}
+
+static bool mptcp_skb_can_collapse_to(u64 write_seq,
+ const struct sk_buff *skb,
+ const struct mptcp_ext *mpext)
+{
+ if (!tcp_skb_can_collapse_to(skb))
+ return false;
+
+ /* can collapse only if MPTCP level sequence is in order and this
+ * mapping has not been xmitted yet
+ */
+ return mpext && mpext->data_seq + mpext->data_len == write_seq &&
+ !mpext->frozen;
+}
+
+/* we can append data to the given data frag if:
+ * - there is space available in the backing page_frag
+ * - the data frag tail matches the current page_frag free offset
+ * - the data frag end sequence number matches the current write seq
+ */
+static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
+ const struct page_frag *pfrag,
+ const struct mptcp_data_frag *df)
+{
+ return df && pfrag->page == df->page &&
+ pfrag->size - pfrag->offset > 0 &&
+ pfrag->offset == (df->offset + df->data_len) &&
+ df->data_seq + df->data_len == msk->write_seq;
+}
+
+static void dfrag_uncharge(struct sock *sk, int len)
+{
+ sk_mem_uncharge(sk, len);
+ sk_wmem_queued_add(sk, -len);
+}
+
+static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
+{
+ int len = dfrag->data_len + dfrag->overhead;
+
+ list_del(&dfrag->list);
+ dfrag_uncharge(sk, len);
+ put_page(dfrag->page);
+}
+
+static void __mptcp_clean_una(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+ u64 snd_una;
+
+ snd_una = msk->snd_una;
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
+ if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
+ break;
+
+ if (unlikely(dfrag == msk->first_pending)) {
+ /* in recovery mode can see ack after the current snd head */
+ if (WARN_ON_ONCE(!msk->recovery))
+ break;
+
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
+ dfrag_clear(sk, dfrag);
+ }
+
+ dfrag = mptcp_rtx_head(sk);
+ if (dfrag && after64(snd_una, dfrag->data_seq)) {
+ u64 delta = snd_una - dfrag->data_seq;
+
+ /* prevent wrap around in recovery mode */
+ if (unlikely(delta > dfrag->already_sent)) {
+ if (WARN_ON_ONCE(!msk->recovery))
+ goto out;
+ if (WARN_ON_ONCE(delta > dfrag->data_len))
+ goto out;
+ dfrag->already_sent += delta - dfrag->already_sent;
+ }
+
+ dfrag->data_seq += delta;
+ dfrag->offset += delta;
+ dfrag->data_len -= delta;
+ dfrag->already_sent -= delta;
+
+ dfrag_uncharge(sk, delta);
+ }
+
+ /* all retransmitted data acked, recovery completed */
+ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt))
+ msk->recovery = false;
+
+out:
+ if (snd_una == READ_ONCE(msk->snd_nxt) &&
+ snd_una == READ_ONCE(msk->write_seq)) {
+ if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
+ mptcp_stop_rtx_timer(sk);
+ } else {
+ mptcp_reset_rtx_timer(sk);
+ }
+}
+
+static void __mptcp_clean_una_wakeup(struct sock *sk)
+{
+ lockdep_assert_held_once(&sk->sk_lock.slock);
+
+ __mptcp_clean_una(sk);
+ mptcp_write_space(sk);
+}
+
+static void mptcp_clean_una_wakeup(struct sock *sk)
+{
+ mptcp_data_lock(sk);
+ __mptcp_clean_una_wakeup(sk);
+ mptcp_data_unlock(sk);
+}
+
+static void mptcp_enter_memory_pressure(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool first = true;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (first)
+ tcp_enter_memory_pressure(ssk);
+ sk_stream_moderate_sndbuf(ssk);
+
+ first = false;
+ }
+ __mptcp_sync_sndbuf(sk);
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+ pfrag, sk->sk_allocation)))
+ return true;
+
+ mptcp_enter_memory_pressure(sk);
+ return false;
+}
+
+static struct mptcp_data_frag *
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
+ int orig_offset)
+{
+ int offset = ALIGN(orig_offset, sizeof(long));
+ struct mptcp_data_frag *dfrag;
+
+ dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
+ dfrag->data_len = 0;
+ dfrag->data_seq = msk->write_seq;
+ dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
+ dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+ dfrag->already_sent = 0;
+ dfrag->page = pfrag->page;
+
+ return dfrag;
+}
+
+struct mptcp_sendmsg_info {
+ int mss_now;
+ int size_goal;
+ u16 limit;
+ u16 sent;
+ unsigned int flags;
+ bool data_lock_held;
+};
+
+static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
+ u64 data_seq, int avail_size)
+{
+ u64 window_end = mptcp_wnd_end(msk);
+ u64 mptcp_snd_wnd;
+
+ if (__mptcp_check_fallback(msk))
+ return avail_size;
+
+ mptcp_snd_wnd = window_end - data_seq;
+ avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);
+
+ if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
+ tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED);
+ }
+
+ return avail_size;
+}
+
+static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
+{
+ struct skb_ext *mpext = __skb_ext_alloc(gfp);
+
+ if (!mpext)
+ return false;
+ __skb_ext_set(skb, SKB_EXT_MPTCP, mpext);
+ return true;
+}
+
+static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
+ if (likely(skb)) {
+ if (likely(__mptcp_add_ext(skb, gfp))) {
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+ return skb;
+ }
+ __kfree_skb(skb);
+ } else {
+ mptcp_enter_memory_pressure(sk);
+ }
+ return NULL;
+}
+
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ skb = __mptcp_do_alloc_tx_skb(sk, gfp);
+ if (!skb)
+ return NULL;
+
+ if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
+ tcp_skb_entail(ssk, skb);
+ return skb;
+ }
+ tcp_skb_tsorted_anchor_cleanup(skb);
+ kfree_skb(skb);
+ return NULL;
+}
+
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+{
+ gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
+
+ return __mptcp_alloc_tx_skb(sk, ssk, gfp);
+}
+
+/* note: this always recompute the csum on the whole skb, even
+ * if we just appended a single frag. More status info needed
+ */
+static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
+{
+ struct mptcp_ext *mpext = mptcp_get_ext(skb);
+ __wsum csum = ~csum_unfold(mpext->csum);
+ int offset = skb->len - added;
+
+ mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
+}
+
+static void mptcp_update_infinite_map(struct mptcp_sock *msk,
+ struct sock *ssk,
+ struct mptcp_ext *mpext)
+{
+ if (!mpext)
+ return;
+
+ mpext->infinite_map = 1;
+ mpext->data_len = 0;
+
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+ mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
+ pr_fallback(msk);
+ mptcp_do_fallback(ssk);
+}
+
+#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
+
+static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
+ struct mptcp_data_frag *dfrag,
+ struct mptcp_sendmsg_info *info)
+{
+ u64 data_seq = dfrag->data_seq + info->sent;
+ int offset = dfrag->offset + info->sent;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool zero_window_probe = false;
+ struct mptcp_ext *mpext = NULL;
+ bool can_coalesce = false;
+ bool reuse_skb = true;
+ struct sk_buff *skb;
+ size_t copy;
+ int i;
+
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
+ msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
+
+ if (WARN_ON_ONCE(info->sent > info->limit ||
+ info->limit > dfrag->data_len))
+ return 0;
+
+ if (unlikely(!__tcp_can_send(ssk)))
+ return -EAGAIN;
+
+ /* compute send limit */
+ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE))
+ ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE;
+ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
+ copy = info->size_goal;
+
+ skb = tcp_write_queue_tail(ssk);
+ if (skb && copy > skb->len) {
+ /* Limit the write to the size available in the
+ * current skb, if any, so that we create at most a new skb.
+ * Explicitly tells TCP internals to avoid collapsing on later
+ * queue management operation, to avoid breaking the ext <->
+ * SSN association set here
+ */
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
+ TCP_SKB_CB(skb)->eor = 1;
+ goto alloc_skb;
+ }
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+ if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
+ tcp_mark_push(tcp_sk(ssk), skb);
+ goto alloc_skb;
+ }
+
+ copy -= skb->len;
+ } else {
+alloc_skb:
+ skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+ if (!skb)
+ return -ENOMEM;
+
+ i = skb_shinfo(skb)->nr_frags;
+ reuse_skb = false;
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ }
+
+ /* Zero window and all data acked? Probe. */
+ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy);
+ if (copy == 0) {
+ u64 snd_una = READ_ONCE(msk->snd_una);
+
+ if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
+ tcp_remove_empty_skb(ssk);
+ return 0;
+ }
+
+ zero_window_probe = true;
+ data_seq = snd_una - 1;
+ copy = 1;
+ }
+
+ copy = min_t(size_t, copy, info->limit - info->sent);
+ if (!sk_wmem_schedule(ssk, copy)) {
+ tcp_remove_empty_skb(ssk);
+ return -ENOMEM;
+ }
+
+ if (can_coalesce) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ get_page(dfrag->page);
+ skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
+ }
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk_wmem_queued_add(ssk, copy);
+ sk_mem_charge(ssk, copy);
+ WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ /* on skb reuse we just need to update the DSS len */
+ if (reuse_skb) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+ mpext->data_len += copy;
+ goto out;
+ }
+
+ memset(mpext, 0, sizeof(*mpext));
+ mpext->data_seq = data_seq;
+ mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
+ mpext->data_len = copy;
+ mpext->use_map = 1;
+ mpext->dsn64 = 1;
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+ mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+ mpext->dsn64);
+
+ if (zero_window_probe) {
+ mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+ mpext->frozen = 1;
+ if (READ_ONCE(msk->csum_enabled))
+ mptcp_update_data_checksum(skb, copy);
+ tcp_push_pending_frames(ssk);
+ return 0;
+ }
+out:
+ if (READ_ONCE(msk->csum_enabled))
+ mptcp_update_data_checksum(skb, copy);
+ if (mptcp_subflow_ctx(ssk)->send_infinite_map)
+ mptcp_update_infinite_map(msk, ssk, mpext);
+ trace_mptcp_sendmsg_frag(mpext);
+ mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+ return copy;
+}
+
+#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
+ sizeof(struct tcphdr) - \
+ MAX_TCP_OPTION_SPACE - \
+ sizeof(struct ipv6hdr) - \
+ sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+ struct sock *ssk;
+ u64 linger_time;
+};
+
+void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
+{
+ if (!subflow->stale)
+ return;
+
+ subflow->stale = 0;
+ MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER);
+}
+
+bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ if (unlikely(subflow->stale)) {
+ u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp);
+
+ if (subflow->stale_rcv_tstamp == rcv_tstamp)
+ return false;
+
+ mptcp_subflow_set_active(subflow);
+ }
+ return __mptcp_subflow_active(subflow);
+}
+
+#define SSK_MODE_ACTIVE 0
+#define SSK_MODE_BACKUP 1
+#define SSK_MODE_MAX 2
+
+/* implement the mptcp packet scheduler;
+ * returns the subflow that will transmit the next DSS
+ * additionally updates the rtx timeout
+ */
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+ struct subflow_send_info send_info[SSK_MODE_MAX];
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ u32 pace, burst, wmem;
+ int i, nr_active = 0;
+ struct sock *ssk;
+ u64 linger_time;
+ long tout = 0;
+
+ /* pick the subflow with the lower wmem/wspace ratio */
+ for (i = 0; i < SSK_MODE_MAX; ++i) {
+ send_info[i].ssk = NULL;
+ send_info[i].linger_time = -1;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ trace_mptcp_subflow_get_send(subflow);
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (!mptcp_subflow_active(subflow))
+ continue;
+
+ tout = max(tout, mptcp_timeout_from_subflow(subflow));
+ nr_active += !subflow->backup;
+ pace = subflow->avg_pacing_rate;
+ if (unlikely(!pace)) {
+ /* init pacing rate from socket */
+ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+ pace = subflow->avg_pacing_rate;
+ if (!pace)
+ continue;
+ }
+
+ linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+ if (linger_time < send_info[subflow->backup].linger_time) {
+ send_info[subflow->backup].ssk = ssk;
+ send_info[subflow->backup].linger_time = linger_time;
+ }
+ }
+ __mptcp_set_timeout(sk, tout);
+
+ /* pick the best backup if no other subflow is active */
+ if (!nr_active)
+ send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+ /* According to the blest algorithm, to avoid HoL blocking for the
+ * faster flow, we need to:
+ * - estimate the faster flow linger time
+ * - use the above to estimate the amount of byte transferred
+ * by the faster flow
+ * - check that the amount of queued data is greter than the above,
+ * otherwise do not use the picked, slower, subflow
+ * We select the subflow with the shorter estimated time to flush
+ * the queued mem, which basically ensure the above. We just need
+ * to check that subflow has a non empty cwin.
+ */
+ ssk = send_info[SSK_MODE_ACTIVE].ssk;
+ if (!ssk || !sk_stream_memory_free(ssk))
+ return NULL;
+
+ burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+ wmem = READ_ONCE(ssk->sk_wmem_queued);
+ if (!burst)
+ return ssk;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+ READ_ONCE(ssk->sk_pacing_rate) * burst,
+ burst + wmem);
+ msk->snd_burst = burst;
+ return ssk;
+}
+
+static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
+{
+ tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
+ release_sock(ssk);
+}
+
+static void mptcp_update_post_push(struct mptcp_sock *msk,
+ struct mptcp_data_frag *dfrag,
+ u32 sent)
+{
+ u64 snd_nxt_new = dfrag->data_seq;
+
+ dfrag->already_sent += sent;
+
+ msk->snd_burst -= sent;
+
+ snd_nxt_new += dfrag->already_sent;
+
+ /* snd_nxt_new can be smaller than snd_nxt in case mptcp
+ * is recovering after a failover. In that event, this re-sends
+ * old segments.
+ *
+ * Thus compute snd_nxt_new candidate based on
+ * the dfrag->data_seq that was sent and the data
+ * that has been handed to the subflow for transmission
+ * and skip update in case it was old dfrag.
+ */
+ if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
+ msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
+ msk->snd_nxt = snd_nxt_new;
+ }
+}
+
+void mptcp_check_and_set_pending(struct sock *sk)
+{
+ if (mptcp_send_head(sk))
+ mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
+}
+
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+ struct mptcp_sendmsg_info *info)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dfrag;
+ int len, copied = 0, err = 0;
+
+ while ((dfrag = mptcp_send_head(sk))) {
+ info->sent = dfrag->already_sent;
+ info->limit = dfrag->data_len;
+ len = dfrag->data_len - dfrag->already_sent;
+ while (len > 0) {
+ int ret = 0;
+
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
+ if (ret <= 0) {
+ err = copied ? : ret;
+ goto out;
+ }
+
+ info->sent += ret;
+ copied += ret;
+ len -= ret;
+
+ mptcp_update_post_push(msk, dfrag, ret);
+ }
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+ if (msk->snd_burst <= 0 ||
+ !sk_stream_memory_free(ssk) ||
+ !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+ err = copied;
+ goto out;
+ }
+ mptcp_set_timeout(sk);
+ }
+ err = copied;
+
+out:
+ return err;
+}
+
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+{
+ struct sock *prev_ssk = NULL, *ssk = NULL;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .flags = flags,
+ };
+ bool do_check_data_fin = false;
+ int push_count = 1;
+
+ while (mptcp_send_head(sk) && (push_count > 0)) {
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ if (mptcp_sched_get_send(msk))
+ break;
+
+ push_count = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+
+ prev_ssk = ssk;
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (ssk != prev_ssk) {
+ /* First check. If the ssk has changed since
+ * the last round, release prev_ssk
+ */
+ if (prev_ssk)
+ mptcp_push_release(prev_ssk, &info);
+
+ /* Need to lock the new subflow only if different
+ * from the previous one, otherwise we are still
+ * helding the relevant lock
+ */
+ lock_sock(ssk);
+ }
+
+ push_count++;
+
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0) {
+ if (ret != -EAGAIN ||
+ (1 << ssk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
+ push_count--;
+ continue;
+ }
+ do_check_data_fin = true;
+ }
+ }
+ }
+
+ /* at this point we held the socket lock for the last subflow we used */
+ if (ssk)
+ mptcp_push_release(ssk, &info);
+
+ /* ensure the rtx timer is running */
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
+ if (do_check_data_fin)
+ mptcp_check_send_data_fin(sk);
+}
+
+static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .data_lock_held = true,
+ };
+ bool keep_pushing = true;
+ struct sock *xmit_ssk;
+ int copied = 0;
+
+ info.flags = 0;
+ while (mptcp_send_head(sk) && keep_pushing) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ int ret = 0;
+
+ /* check for a different subflow usage only after
+ * spooling the first chunk of data
+ */
+ if (first) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ first = false;
+ if (ret <= 0)
+ break;
+ copied += ret;
+ continue;
+ }
+
+ if (mptcp_sched_get_send(msk))
+ goto out;
+
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0)
+ keep_pushing = false;
+ copied += ret;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+ if (xmit_ssk != ssk) {
+ mptcp_subflow_delegate(subflow,
+ MPTCP_DELEGATE_SEND);
+ keep_pushing = false;
+ }
+ }
+ }
+ }
+
+out:
+ /* __mptcp_alloc_tx_skb could have released some wmem and we are
+ * not going to flush it via release_sock()
+ */
+ if (copied) {
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
+
+ if (msk->snd_data_fin_enable &&
+ msk->snd_nxt + 1 == msk->write_seq)
+ mptcp_schedule_work(sk);
+ }
+}
+
+static void mptcp_set_nospace(struct sock *sk)
+{
+ /* enable autotune */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* will be cleared on avail space */
+ set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
+}
+
+static int mptcp_disconnect(struct sock *sk, int flags);
+
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
+ size_t len, int *copied_syn)
+{
+ unsigned int saved_flags = msg->msg_flags;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk;
+ int ret;
+
+ /* on flags based fastopen the mptcp is supposed to create the
+ * first subflow right now. Otherwise we are in the defer_connect
+ * path, and the first subflow must be already present.
+ * Since the defer_connect flag is cleared after the first succsful
+ * fastopen attempt, no need to check for additional subflow status.
+ */
+ if (msg->msg_flags & MSG_FASTOPEN) {
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
+ }
+ if (!msk->first)
+ return -EINVAL;
+
+ ssk = msk->first;
+
+ lock_sock(ssk);
+ msg->msg_flags |= MSG_DONTWAIT;
+ msk->fastopening = 1;
+ ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
+ msk->fastopening = 0;
+ msg->msg_flags = saved_flags;
+ release_sock(ssk);
+
+ /* do the blocking bits of inet_stream_connect outside the ssk socket lock */
+ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
+ ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, msg->msg_flags, 1);
+
+ /* Keep the same behaviour of plain TCP: zero the copied bytes in
+ * case of any error, except timeout or signal
+ */
+ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
+ *copied_syn = 0;
+ } else if (ret && ret != -EINPROGRESS) {
+ /* The disconnect() op called by tcp_sendmsg_fastopen()/
+ * __inet_stream_connect() can fail, due to looking check,
+ * see mptcp_disconnect().
+ * Attempt it again outside the problematic scope.
+ */
+ if (!mptcp_disconnect(sk, 0))
+ sk->sk_socket->state = SS_UNCONNECTED;
+ }
+ inet_clear_bit(DEFER_CONNECT, sk);
+
+ return ret;
+}
+
+static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct page_frag *pfrag;
+ size_t copied = 0;
+ int ret = 0;
+ long timeo;
+
+ /* silently ignore everything else */
+ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN;
+
+ lock_sock(sk);
+
+ if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
+ msg->msg_flags & MSG_FASTOPEN)) {
+ int copied_syn = 0;
+
+ ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
+ copied += copied_syn;
+ if (ret == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (ret)
+ goto do_error;
+ }
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+ ret = sk_stream_wait_connect(sk, &timeo);
+ if (ret)
+ goto do_error;
+ }
+
+ ret = -EPIPE;
+ if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
+ goto do_error;
+
+ pfrag = sk_page_frag(sk);
+
+ while (msg_data_left(msg)) {
+ int total_ts, frag_truesize = 0;
+ struct mptcp_data_frag *dfrag;
+ bool dfrag_collapsed;
+ size_t psize, offset;
+
+ /* reuse tail pfrag, if possible, or carve a new one from the
+ * page allocator
+ */
+ dfrag = mptcp_pending_tail(sk);
+ dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+ if (!dfrag_collapsed) {
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_memory;
+
+ if (!mptcp_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
+ frag_truesize = dfrag->overhead;
+ }
+
+ /* we do not bound vs wspace, to allow a single packet.
+ * memory accounting will prevent execessive memory usage
+ * anyway
+ */
+ offset = dfrag->offset + dfrag->data_len;
+ psize = pfrag->size - offset;
+ psize = min_t(size_t, psize, msg_data_left(msg));
+ total_ts = psize + frag_truesize;
+
+ if (!sk_wmem_schedule(sk, total_ts))
+ goto wait_for_memory;
+
+ if (copy_page_from_iter(dfrag->page, offset, psize,
+ &msg->msg_iter) != psize) {
+ ret = -EFAULT;
+ goto do_error;
+ }
+
+ /* data successfully copied into the write queue */
+ sk_forward_alloc_add(sk, -total_ts);
+ copied += psize;
+ dfrag->data_len += psize;
+ frag_truesize += psize;
+ pfrag->offset += frag_truesize;
+ WRITE_ONCE(msk->write_seq, msk->write_seq + psize);
+
+ /* charge data on mptcp pending queue to the msk socket
+ * Note: we charge such data both to sk and ssk
+ */
+ sk_wmem_queued_add(sk, frag_truesize);
+ if (!dfrag_collapsed) {
+ get_page(dfrag->page);
+ list_add_tail(&dfrag->list, &msk->rtx_queue);
+ if (!msk->first_pending)
+ WRITE_ONCE(msk->first_pending, dfrag);
+ }
+ pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
+ dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
+ !dfrag_collapsed);
+
+ continue;
+
+wait_for_memory:
+ mptcp_set_nospace(sk);
+ __mptcp_push_pending(sk, msg->msg_flags);
+ ret = sk_stream_wait_memory(sk, &timeo);
+ if (ret)
+ goto do_error;
+ }
+
+ if (copied)
+ __mptcp_push_pending(sk, msg->msg_flags);
+
+out:
+ release_sock(sk);
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+
+ copied = sk_stream_error(sk, msg->msg_flags, ret);
+ goto out;
+}
+
+static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
+ struct msghdr *msg,
+ size_t len, int flags,
+ struct scm_timestamping_internal *tss,
+ int *cmsg_flags)
+{
+ struct sk_buff *skb, *tmp;
+ int copied = 0;
+
+ skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
+ u32 offset = MPTCP_SKB_CB(skb)->offset;
+ u32 data_len = skb->len - offset;
+ u32 count = min_t(size_t, len - copied, data_len);
+ int err;
+
+ if (!(flags & MSG_TRUNC)) {
+ err = skb_copy_datagram_msg(skb, offset, msg, count);
+ if (unlikely(err < 0)) {
+ if (!copied)
+ return err;
+ break;
+ }
+ }
+
+ if (MPTCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ *cmsg_flags |= MPTCP_CMSG_TS;
+ }
+
+ copied += count;
+
+ if (count < data_len) {
+ if (!(flags & MSG_PEEK)) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ MPTCP_SKB_CB(skb)->map_seq += count;
+ }
+ break;
+ }
+
+ if (!(flags & MSG_PEEK)) {
+ /* we will bulk release the skb memory later */
+ skb->destructor = NULL;
+ WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
+ __skb_unlink(skb, &msk->receive_queue);
+ __kfree_skb(skb);
+ }
+
+ if (copied >= len)
+ break;
+ }
+
+ return copied;
+}
+
+/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
+ *
+ * Only difference: Use highest rtt estimate of the subflows in use.
+ */
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ u8 scaling_ratio = U8_MAX;
+ u32 time, advmss = 1;
+ u64 rtt_us, mstamp;
+
+ msk_owned_by_me(msk);
+
+ if (copied <= 0)
+ return;
+
+ msk->rcvq_space.copied += copied;
+
+ mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+ time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+
+ rtt_us = msk->rcvq_space.rtt_us;
+ if (rtt_us && time < (rtt_us >> 3))
+ return;
+
+ rtt_us = 0;
+ mptcp_for_each_subflow(msk, subflow) {
+ const struct tcp_sock *tp;
+ u64 sf_rtt_us;
+ u32 sf_advmss;
+
+ tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
+
+ sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
+ sf_advmss = READ_ONCE(tp->advmss);
+
+ rtt_us = max(sf_rtt_us, rtt_us);
+ advmss = max(sf_advmss, advmss);
+ scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
+ }
+
+ msk->rcvq_space.rtt_us = rtt_us;
+ msk->scaling_ratio = scaling_ratio;
+ if (time < (rtt_us >> 3) || rtt_us == 0)
+ return;
+
+ if (msk->rcvq_space.copied <= msk->rcvq_space.space)
+ goto new_measure;
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ u64 rcvwin, grow;
+ int rcvbuf;
+
+ rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
+
+ grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
+
+ do_div(grow, msk->rcvq_space.space);
+ rcvwin += (grow << 1);
+
+ rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin),
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+
+ if (rcvbuf > sk->sk_rcvbuf) {
+ u32 window_clamp;
+
+ window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf);
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+ bool slow;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
+ WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
+ tcp_sk(ssk)->window_clamp = window_clamp;
+ tcp_cleanup_rbuf(ssk, 1);
+ unlock_sock_fast(ssk, slow);
+ }
+ }
+ }
+
+ msk->rcvq_space.space = msk->rcvq_space.copied;
+new_measure:
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.time = mstamp;
+}
+
+static void __mptcp_update_rmem(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->rmem_released)
+ return;
+
+ atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
+ mptcp_rmem_uncharge(sk, msk->rmem_released);
+ WRITE_ONCE(msk->rmem_released, 0);
+}
+
+static void __mptcp_splice_receive_queue(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+}
+
+static bool __mptcp_move_skbs(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+ bool ret, done;
+
+ do {
+ struct sock *ssk = mptcp_subflow_recv_lookup(msk);
+ bool slowpath;
+
+ /* we can have data pending in the subflows only if the msk
+ * receive buffer was full at subflow_data_ready() time,
+ * that is an unlikely slow path.
+ */
+ if (likely(!ssk))
+ break;
+
+ slowpath = lock_sock_fast(ssk);
+ mptcp_data_lock(sk);
+ __mptcp_update_rmem(sk);
+ done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ mptcp_data_unlock(sk);
+
+ if (unlikely(ssk->sk_err))
+ __mptcp_error_report(sk);
+ unlock_sock_fast(ssk, slowpath);
+ } while (!done);
+
+ /* acquire the data lock only if some input data is pending */
+ ret = moved > 0;
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
+ !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
+ mptcp_data_lock(sk);
+ __mptcp_update_rmem(sk);
+ ret |= __mptcp_ofo_queue(msk);
+ __mptcp_splice_receive_queue(sk);
+ mptcp_data_unlock(sk);
+ }
+ if (ret)
+ mptcp_check_data_fin((struct sock *)msk);
+ return !skb_queue_empty(&msk->receive_queue);
+}
+
+static unsigned int mptcp_inq_hint(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ const struct sk_buff *skb;
+
+ skb = skb_peek(&msk->receive_queue);
+ if (skb) {
+ u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ if (hint_val >= INT_MAX)
+ return INT_MAX;
+
+ return (unsigned int)hint_val;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+ return 1;
+
+ return 0;
+}
+
+static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct scm_timestamping_internal tss;
+ int copied = 0, cmsg_flags = 0;
+ int target;
+ long timeo;
+
+ /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ lock_sock(sk);
+ if (unlikely(sk->sk_state == TCP_LISTEN)) {
+ copied = -ENOTCONN;
+ goto out_err;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ len = min_t(size_t, len, INT_MAX);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ if (unlikely(msk->recvmsg_inq))
+ cmsg_flags = MPTCP_CMSG_INQ;
+
+ while (copied < len) {
+ int bytes_read;
+
+ bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
+ if (unlikely(bytes_read < 0)) {
+ if (!copied)
+ copied = bytes_read;
+ goto out_err;
+ }
+
+ copied += bytes_read;
+
+ /* be sure to advertise window change */
+ mptcp_cleanup_rbuf(msk);
+
+ if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
+ continue;
+
+ /* only the master socket status is relevant here. The exit
+ * conditions mirror closely tcp_recvmsg()
+ */
+ if (copied >= target)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ /* race breaker: the shutdown could be after the
+ * previous receive queue check
+ */
+ if (__mptcp_move_skbs(msk))
+ continue;
+ break;
+ }
+
+ if (sk->sk_state == TCP_CLOSE) {
+ copied = -ENOTCONN;
+ break;
+ }
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ pr_debug("block timeout %ld", timeo);
+ sk_wait_data(sk, &timeo, NULL);
+ }
+
+out_err:
+ if (cmsg_flags && copied >= 0) {
+ if (cmsg_flags & MPTCP_CMSG_TS)
+ tcp_recv_timestamp(msg, sk, &tss);
+
+ if (cmsg_flags & MPTCP_CMSG_INQ) {
+ unsigned int inq = mptcp_inq_hint(sk);
+
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
+ }
+
+ pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
+ msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
+ skb_queue_empty(&msk->receive_queue), copied);
+ if (!(flags & MSG_PEEK))
+ mptcp_rcv_space_adjust(msk, copied);
+
+ release_sock(sk);
+ return copied;
+}
+
+static void mptcp_retransmit_timer(struct timer_list *t)
+{
+ struct inet_connection_sock *icsk = from_timer(icsk, t,
+ icsk_retransmit_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ /* we need a process context to retransmit */
+ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags))
+ mptcp_schedule_work(sk);
+ } else {
+ /* delegate our work to tcp_release_cb() */
+ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static void mptcp_tout_timer(struct timer_list *t)
+{
+ struct sock *sk = from_timer(sk, t, sk_timer);
+
+ mptcp_schedule_work(sk);
+ sock_put(sk);
+}
+
+/* Find an idle subflow. Return NULL if there is unacked data at tcp
+ * level.
+ *
+ * A backup subflow is returned only if that is the only kind available.
+ */
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
+{
+ struct sock *backup = NULL, *pick = NULL;
+ struct mptcp_subflow_context *subflow;
+ int min_stale_count = INT_MAX;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!__mptcp_subflow_active(subflow))
+ continue;
+
+ /* still data outstanding at TCP level? skip this */
+ if (!tcp_rtx_and_write_queues_empty(ssk)) {
+ mptcp_pm_subflow_chk_stale(msk, ssk);
+ min_stale_count = min_t(int, min_stale_count, subflow->stale_count);
+ continue;
+ }
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+ continue;
+ }
+
+ if (!pick)
+ pick = ssk;
+ }
+
+ if (pick)
+ return pick;
+
+ /* use backup only if there are no progresses anywhere */
+ return min_stale_count > 1 ? backup : NULL;
+}
+
+bool __mptcp_retransmit_pending_data(struct sock *sk)
+{
+ struct mptcp_data_frag *cur, *rtx_head;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (__mptcp_check_fallback(msk))
+ return false;
+
+ if (tcp_rtx_and_write_queues_empty(sk))
+ return false;
+
+ /* the closing socket has some data untransmitted and/or unacked:
+ * some data in the mptcp rtx queue has not really xmitted yet.
+ * keep it simple and re-inject the whole mptcp level rtx queue
+ */
+ mptcp_data_lock(sk);
+ __mptcp_clean_una_wakeup(sk);
+ rtx_head = mptcp_rtx_head(sk);
+ if (!rtx_head) {
+ mptcp_data_unlock(sk);
+ return false;
+ }
+
+ msk->recovery_snd_nxt = msk->snd_nxt;
+ msk->recovery = true;
+ mptcp_data_unlock(sk);
+
+ msk->first_pending = rtx_head;
+ msk->snd_burst = 0;
+
+ /* be sure to clear the "sent status" on all re-injected fragments */
+ list_for_each_entry(cur, &msk->rtx_queue, list) {
+ if (!cur->already_sent)
+ break;
+ cur->already_sent = 0;
+ }
+
+ return true;
+}
+
+/* flags for __mptcp_close_ssk() */
+#define MPTCP_CF_PUSH BIT(1)
+#define MPTCP_CF_FASTCLOSE BIT(2)
+
+/* be sure to send a reset only if the caller asked for it, also
+ * clean completely the subflow status when the subflow reaches
+ * TCP_CLOSE state
+ */
+static void __mptcp_subflow_disconnect(struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ unsigned int flags)
+{
+ if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+ (flags & MPTCP_CF_FASTCLOSE)) {
+ /* The MPTCP code never wait on the subflow sockets, TCP-level
+ * disconnect should never fail
+ */
+ WARN_ON_ONCE(tcp_disconnect(ssk, 0));
+ mptcp_subflow_ctx_reset(subflow);
+ } else {
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ }
+}
+
+/* subflow sockets can be either outgoing (connect) or incoming
+ * (accept).
+ *
+ * Outgoing subflows use in-kernel sockets.
+ * Incoming subflows do not have their own 'struct socket' allocated,
+ * so we need to use tcp_close() after detaching them from the mptcp
+ * parent socket.
+ */
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ unsigned int flags)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool dispose_it, need_push = false;
+
+ /* If the first subflow moved to a close state before accept, e.g. due
+ * to an incoming reset or listener shutdown, the subflow socket is
+ * already deleted by inet_child_forget() and the mptcp socket can't
+ * survive too.
+ */
+ if (msk->in_accept_queue && msk->first == ssk &&
+ (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
+ /* ensure later check in mptcp_worker() will dispose the msk */
+ mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
+ sock_set_flag(sk, SOCK_DEAD);
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ mptcp_subflow_drop_ctx(ssk);
+ goto out_release;
+ }
+
+ dispose_it = msk->free_first || ssk != msk->first;
+ if (dispose_it)
+ list_del(&subflow->node);
+
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+
+ if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
+ /* be sure to force the tcp_close path
+ * to generate the egress reset
+ */
+ ssk->sk_lingertime = 0;
+ sock_set_flag(ssk, SOCK_LINGER);
+ subflow->send_fastclose = 1;
+ }
+
+ need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
+ if (!dispose_it) {
+ __mptcp_subflow_disconnect(ssk, subflow, flags);
+ release_sock(ssk);
+
+ goto out;
+ }
+
+ subflow->disposable = 1;
+
+ /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
+ * the ssk has been already destroyed, we just need to release the
+ * reference owned by msk;
+ */
+ if (!inet_csk(ssk)->icsk_ulp_ops) {
+ WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD));
+ kfree_rcu(subflow, rcu);
+ } else {
+ /* otherwise tcp will dispose of the ssk and subflow ctx */
+ __tcp_close(ssk, 0);
+
+ /* close acquired an extra ref */
+ __sock_put(ssk);
+ }
+
+out_release:
+ __mptcp_subflow_error_report(sk, ssk);
+ release_sock(ssk);
+
+ sock_put(ssk);
+
+ if (ssk == msk->first)
+ WRITE_ONCE(msk->first, NULL);
+
+out:
+ __mptcp_sync_sndbuf(sk);
+ if (need_push)
+ __mptcp_push_pending(sk, 0);
+
+ /* Catch every 'all subflows closed' scenario, including peers silently
+ * closing them, e.g. due to timeout.
+ * For established sockets, allow an additional timeout before closing,
+ * as the protocol can still create more subflows.
+ */
+ if (list_is_singular(&msk->conn_list) && msk->first &&
+ inet_sk_state_load(msk->first) == TCP_CLOSE) {
+ if (sk->sk_state != TCP_ESTABLISHED ||
+ msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_close_wake_up(sk);
+ } else {
+ mptcp_start_tout_timer(sk);
+ }
+ }
+}
+
+void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow)
+{
+ if (sk->sk_state == TCP_ESTABLISHED)
+ mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
+
+ /* subflow aborted before reaching the fully_established status
+ * attempt the creation of the next subflow
+ */
+ mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+
+ __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
+}
+
+static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ return 0;
+}
+
+static void __mptcp_close_subflow(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ might_sleep();
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (inet_sk_state_load(ssk) != TCP_CLOSE)
+ continue;
+
+ /* 'subflow_data_ready' will re-sched once rx queue is empty */
+ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue))
+ continue;
+
+ mptcp_close_ssk(sk, ssk, subflow);
+ }
+
+}
+
+static bool mptcp_close_tout_expired(const struct sock *sk)
+{
+ if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
+ sk->sk_state == TCP_CLOSE)
+ return false;
+
+ return time_after32(tcp_jiffies32,
+ inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
+}
+
+static void mptcp_check_fastclose(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ if (likely(!READ_ONCE(msk->rcv_fastclose)))
+ return;
+
+ mptcp_token_destroy(msk);
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ slow = lock_sock_fast(tcp_sk);
+ if (tcp_sk->sk_state != TCP_CLOSE) {
+ tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+ tcp_set_state(tcp_sk, TCP_CLOSE);
+ }
+ unlock_sock_fast(tcp_sk, slow);
+ }
+
+ /* Mirror the tcp_reset() error propagation */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ WRITE_ONCE(sk->sk_err, ECONNREFUSED);
+ break;
+ case TCP_CLOSE_WAIT:
+ WRITE_ONCE(sk->sk_err, EPIPE);
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
+ }
+
+ inet_sk_state_store(sk, TCP_CLOSE);
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
+
+ /* the calling mptcp_worker will properly destroy the socket */
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk->sk_state_change(sk);
+ sk_error_report(sk);
+}
+
+static void __mptcp_retrans(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sendmsg_info info = {};
+ struct mptcp_data_frag *dfrag;
+ struct sock *ssk;
+ int ret, err;
+ u16 len = 0;
+
+ mptcp_clean_una_wakeup(sk);
+
+ /* first check ssk: need to kick "stale" logic */
+ err = mptcp_sched_get_retrans(msk);
+ dfrag = mptcp_rtx_head(sk);
+ if (!dfrag) {
+ if (mptcp_data_fin_enabled(msk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_retransmits++;
+ mptcp_set_datafin_timeout(sk);
+ mptcp_send_ack(msk);
+
+ goto reset_timer;
+ }
+
+ if (!mptcp_send_head(sk))
+ return;
+
+ goto reset_timer;
+ }
+
+ if (err)
+ goto reset_timer;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ u16 copied = 0;
+
+ mptcp_subflow_set_scheduled(subflow, false);
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+
+ /* limit retransmission to the bytes already sent on some subflows */
+ info.sent = 0;
+ info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+ dfrag->already_sent;
+ while (info.sent < info.limit) {
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ if (ret <= 0)
+ break;
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+ copied += ret;
+ info.sent += ret;
+ }
+ if (copied) {
+ len = max(copied, len);
+ tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+ info.size_goal);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+ }
+
+ release_sock(ssk);
+ }
+ }
+
+ msk->bytes_retrans += len;
+ dfrag->already_sent = max(dfrag->already_sent, len);
+
+reset_timer:
+ mptcp_check_and_set_pending(sk);
+
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
+}
+
+/* schedule the timeout timer for the relevant event: either close timeout
+ * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
+ */
+void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned long timeout, close_timeout;
+
+ if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
+ return;
+
+ close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
+ TCP_TIMEWAIT_LEN;
+
+ /* the close timeout takes precedence on the fail one, and here at least one of
+ * them is active
+ */
+ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
+
+ sk_reset_timer(sk, &sk->sk_timer, timeout);
+}
+
+static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
+{
+ struct sock *ssk = msk->first;
+ bool slow;
+
+ if (!ssk)
+ return;
+
+ pr_debug("MP_FAIL doesn't respond, reset the subflow");
+
+ slow = lock_sock_fast(ssk);
+ mptcp_subflow_reset(ssk);
+ WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
+ unlock_sock_fast(ssk, slow);
+}
+
+static void mptcp_do_fastclose(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
+ subflow, MPTCP_CF_FASTCLOSE);
+}
+
+static void mptcp_worker(struct work_struct *work)
+{
+ struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
+ struct sock *sk = (struct sock *)msk;
+ unsigned long fail_tout;
+ int state;
+
+ lock_sock(sk);
+ state = sk->sk_state;
+ if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ goto unlock;
+
+ mptcp_check_fastclose(msk);
+
+ mptcp_pm_nl_work(msk);
+
+ mptcp_check_send_data_fin(sk);
+ mptcp_check_data_fin_ack(sk);
+ mptcp_check_data_fin(sk);
+
+ if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ __mptcp_close_subflow(sk);
+
+ if (mptcp_close_tout_expired(sk)) {
+ mptcp_do_fastclose(sk);
+ mptcp_close_wake_up(sk);
+ }
+
+ if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ goto unlock;
+ }
+
+ if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
+ __mptcp_retrans(sk);
+
+ fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
+ if (fail_tout && time_after(jiffies, fail_tout))
+ mptcp_mp_fail_no_response(msk);
+
+unlock:
+ release_sock(sk);
+ sock_put(sk);
+}
+
+static void __mptcp_init_sock(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ INIT_LIST_HEAD(&msk->conn_list);
+ INIT_LIST_HEAD(&msk->join_list);
+ INIT_LIST_HEAD(&msk->rtx_queue);
+ INIT_WORK(&msk->work, mptcp_worker);
+ __skb_queue_head_init(&msk->receive_queue);
+ msk->out_of_order_queue = RB_ROOT;
+ msk->first_pending = NULL;
+ msk->rmem_fwd_alloc = 0;
+ WRITE_ONCE(msk->rmem_released, 0);
+ msk->timer_ival = TCP_RTO_MIN;
+
+ WRITE_ONCE(msk->first, NULL);
+ inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
+ WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ WRITE_ONCE(msk->allow_infinite_fallback, true);
+ msk->recovery = false;
+ msk->subflow_id = 1;
+
+ mptcp_pm_data_init(msk);
+
+ /* re-use the csk retrans timer for MPTCP-level retrans */
+ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+ timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
+}
+
+static void mptcp_ca_reset(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_assign_congestion_control(sk);
+ strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
+
+ /* no need to keep a reference to the ops, the name will suffice */
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = NULL;
+}
+
+static int mptcp_init_sock(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ int ret;
+
+ __mptcp_init_sock(sk);
+
+ if (!mptcp_is_enabled(net))
+ return -ENOPROTOOPT;
+
+ if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
+ return -ENOMEM;
+
+ ret = mptcp_init_sched(mptcp_sk(sk),
+ mptcp_sched_find(mptcp_get_scheduler(net)));
+ if (ret)
+ return ret;
+
+ set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
+
+ /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
+ * propagate the correct value
+ */
+ mptcp_ca_reset(sk);
+
+ sk_sockets_allocated_inc(sk);
+ sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
+ sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);
+
+ return 0;
+}
+
+static void __mptcp_clear_xmit(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+
+ WRITE_ONCE(msk->first_pending, NULL);
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
+ dfrag_clear(sk, dfrag);
+}
+
+void mptcp_cancel_work(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (cancel_work_sync(&msk->work))
+ __sock_put(sk);
+}
+
+void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
+{
+ lock_sock(ssk);
+
+ switch (ssk->sk_state) {
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ fallthrough;
+ case TCP_SYN_SENT:
+ WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK));
+ break;
+ default:
+ if (__mptcp_check_fallback(mptcp_sk(sk))) {
+ pr_debug("Fallback");
+ ssk->sk_shutdown |= how;
+ tcp_shutdown(ssk, how);
+
+ /* simulate the data_fin ack reception to let the state
+ * machine move forward
+ */
+ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
+ mptcp_schedule_work(sk);
+ } else {
+ pr_debug("Sending DATA_FIN on subflow %p", ssk);
+ tcp_send_ack(ssk);
+ if (!mptcp_rtx_timer_pending(sk))
+ mptcp_reset_rtx_timer(sk);
+ }
+ break;
+ }
+
+ release_sock(ssk);
+}
+
+static const unsigned char new_state[16] = {
+ /* current state: new state: action: */
+ [0 /* (Invalid) */] = TCP_CLOSE,
+ [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_SYN_SENT] = TCP_CLOSE,
+ [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
+ [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
+ [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */
+ [TCP_CLOSE] = TCP_CLOSE,
+ [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
+ [TCP_LAST_ACK] = TCP_LAST_ACK,
+ [TCP_LISTEN] = TCP_CLOSE,
+ [TCP_CLOSING] = TCP_CLOSING,
+ [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
+};
+
+static int mptcp_close_state(struct sock *sk)
+{
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+
+ inet_sk_state_store(sk, ns);
+
+ return next & TCP_ACTION_FIN;
+}
+
+static void mptcp_check_send_data_fin(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu",
+ msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
+ msk->snd_nxt, msk->write_seq);
+
+ /* we still need to enqueue subflows or not really shutting down,
+ * skip this
+ */
+ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
+ mptcp_send_head(sk))
+ return;
+
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
+ }
+}
+
+static void __mptcp_wr_shutdown(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d",
+ msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
+ !!mptcp_send_head(sk));
+
+ /* will be ignored by fallback sockets */
+ WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+ WRITE_ONCE(msk->snd_data_fin_enable, 1);
+
+ mptcp_check_send_data_fin(sk);
+}
+
+static void __mptcp_destroy_sock(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p", msk);
+
+ might_sleep();
+
+ mptcp_stop_rtx_timer(sk);
+ sk_stop_timer(sk, &sk->sk_timer);
+ msk->pm.status = 0;
+ mptcp_release_sched(msk);
+
+ sk->sk_prot->destroy(sk);
+
+ WARN_ON_ONCE(msk->rmem_fwd_alloc);
+ WARN_ON_ONCE(msk->rmem_released);
+ sk_stream_kill_queues(sk);
+ xfrm_sk_free_policy(sk);
+
+ sock_put(sk);
+}
+
+void __mptcp_unaccepted_force_close(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_DEAD);
+ mptcp_do_fastclose(sk);
+ __mptcp_destroy_sock(sk);
+}
+
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+ /* Concurrent splices from sk_receive_queue into receive_queue will
+ * always show at least one non-empty queue when checked in this order.
+ */
+ if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
+ skb_queue_empty_lockless(&msk->receive_queue))
+ return 0;
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
+static void mptcp_check_listen_stop(struct sock *sk)
+{
+ struct sock *ssk;
+
+ if (inet_sk_state_load(sk) != TCP_LISTEN)
+ return;
+
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ ssk = mptcp_sk(sk)->first;
+ if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
+ return;
+
+ lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ tcp_set_state(ssk, TCP_CLOSE);
+ mptcp_subflow_queue_clean(sk, ssk);
+ inet_csk_listen_stop(ssk);
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+ release_sock(ssk);
+}
+
+bool __mptcp_close(struct sock *sk, long timeout)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool do_cancel_work = false;
+ int subflows_alive = 0;
+
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
+
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
+ mptcp_check_listen_stop(sk);
+ inet_sk_state_store(sk, TCP_CLOSE);
+ goto cleanup;
+ }
+
+ if (mptcp_check_readable(msk) || timeout < 0) {
+ /* If the msk has read data, or the caller explicitly ask it,
+ * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
+ */
+ mptcp_do_fastclose(sk);
+ timeout = 0;
+ } else if (mptcp_close_state(sk)) {
+ __mptcp_wr_shutdown(sk);
+ }
+
+ sk_stream_wait_close(sk, timeout);
+
+cleanup:
+ /* orphan all the subflows */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast_nested(ssk);
+
+ subflows_alive += ssk->sk_state != TCP_CLOSE;
+
+ /* since the close timeout takes precedence on the fail one,
+ * cancel the latter
+ */
+ if (ssk == msk->first)
+ subflow->fail_tout = 0;
+
+ /* detach from the parent socket, but allow data_ready to
+ * push incoming data into the mptcp stack, to properly ack it
+ */
+ ssk->sk_socket = NULL;
+ ssk->sk_wq = NULL;
+ unlock_sock_fast(ssk, slow);
+ }
+ sock_orphan(sk);
+
+ /* all the subflows are closed, only timeout can change the msk
+ * state, let's not keep resources busy for no reasons
+ */
+ if (subflows_alive == 0)
+ inet_sk_state_store(sk, TCP_CLOSE);
+
+ sock_hold(sk);
+ pr_debug("msk=%p state=%d", sk, sk->sk_state);
+ if (msk->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+
+ if (sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ do_cancel_work = true;
+ } else {
+ mptcp_start_tout_timer(sk);
+ }
+
+ return do_cancel_work;
+}
+
+static void mptcp_close(struct sock *sk, long timeout)
+{
+ bool do_cancel_work;
+
+ lock_sock(sk);
+
+ do_cancel_work = __mptcp_close(sk, timeout);
+ release_sock(sk);
+ if (do_cancel_work)
+ mptcp_cancel_work(sk);
+
+ sock_put(sk);
+}
+
+static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
+ struct ipv6_pinfo *msk6 = inet6_sk(msk);
+
+ msk->sk_v6_daddr = ssk->sk_v6_daddr;
+ msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;
+
+ if (msk6 && ssk6) {
+ msk6->saddr = ssk6->saddr;
+ msk6->flow_label = ssk6->flow_label;
+ }
+#endif
+
+ inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
+ inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
+ inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
+ inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
+ inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
+ inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
+}
+
+static int mptcp_disconnect(struct sock *sk, int flags)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* We are on the fastopen error path. We can't call straight into the
+ * subflows cleanup code due to lock nesting (we are already under
+ * msk->firstsocket lock).
+ */
+ if (msk->fastopening)
+ return -EBUSY;
+
+ mptcp_check_listen_stop(sk);
+ inet_sk_state_store(sk, TCP_CLOSE);
+
+ mptcp_stop_rtx_timer(sk);
+ mptcp_stop_tout_timer(sk);
+
+ if (msk->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+
+ /* msk->subflow is still intact, the following will not free the first
+ * subflow
+ */
+ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
+ WRITE_ONCE(msk->flags, 0);
+ msk->cb_flags = 0;
+ msk->push_pending = 0;
+ msk->recovery = false;
+ msk->can_ack = false;
+ msk->fully_established = false;
+ msk->rcv_data_fin = false;
+ msk->snd_data_fin_enable = false;
+ msk->rcv_fastclose = false;
+ msk->use_64bit_ack = false;
+ WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ mptcp_pm_data_reset(msk);
+ mptcp_ca_reset(sk);
+ msk->bytes_acked = 0;
+ msk->bytes_received = 0;
+ msk->bytes_sent = 0;
+ msk->bytes_retrans = 0;
+
+ WRITE_ONCE(sk->sk_shutdown, 0);
+ sk_error_report(sk);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
+{
+ unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
+
+ return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+}
+#endif
+
+struct sock *mptcp_sk_clone_init(const struct sock *sk,
+ const struct mptcp_options_received *mp_opt,
+ struct sock *ssk,
+ struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
+ struct mptcp_sock *msk;
+
+ if (!nsk)
+ return NULL;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (nsk->sk_family == AF_INET6)
+ inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
+#endif
+
+ __mptcp_init_sock(nsk);
+
+ msk = mptcp_sk(nsk);
+ msk->local_key = subflow_req->local_key;
+ msk->token = subflow_req->token;
+ msk->in_accept_queue = 1;
+ WRITE_ONCE(msk->fully_established, false);
+ if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
+ WRITE_ONCE(msk->csum_enabled, true);
+
+ msk->write_seq = subflow_req->idsn + 1;
+ msk->snd_nxt = msk->write_seq;
+ msk->snd_una = msk->write_seq;
+ msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
+ msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+ mptcp_init_sched(msk, mptcp_sk(sk)->sched);
+
+ /* passive msk is created after the first/MPC subflow */
+ msk->subflow_id = 2;
+
+ sock_reset_flag(nsk, SOCK_RCU_FREE);
+ security_inet_csk_clone(nsk, req);
+
+ /* this can't race with mptcp_close(), as the msk is
+ * not yet exposted to user-space
+ */
+ inet_sk_state_store(nsk, TCP_ESTABLISHED);
+
+ /* The msk maintain a ref to each subflow in the connections list */
+ WRITE_ONCE(msk->first, ssk);
+ list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list);
+ sock_hold(ssk);
+
+ /* new mpc subflow takes ownership of the newly
+ * created mptcp socket
+ */
+ mptcp_token_accept(subflow_req, msk);
+
+ /* set msk addresses early to ensure mptcp_pm_get_local_id()
+ * uses the correct data
+ */
+ mptcp_copy_inaddrs(nsk, ssk);
+ __mptcp_propagate_sndbuf(nsk, ssk);
+
+ mptcp_rcv_space_init(msk, ssk);
+ bh_unlock_sock(nsk);
+
+ /* note: the newly allocated socket refcount is 2 now */
+ return nsk;
+}
+
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ msk->rcvq_space.time = tp->tcp_mstamp;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+
+ WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
+}
+
+static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err,
+ bool kern)
+{
+ struct sock *newsk;
+
+ pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
+ newsk = inet_csk_accept(ssk, flags, err, kern);
+ if (!newsk)
+ return NULL;
+
+ pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
+ if (sk_is_mptcp(newsk)) {
+ struct mptcp_subflow_context *subflow;
+ struct sock *new_mptcp_sock;
+
+ subflow = mptcp_subflow_ctx(newsk);
+ new_mptcp_sock = subflow->conn;
+
+ /* is_mptcp should be false if subflow->conn is missing, see
+ * subflow_syn_recv_sock()
+ */
+ if (WARN_ON_ONCE(!new_mptcp_sock)) {
+ tcp_sk(newsk)->is_mptcp = 0;
+ goto out;
+ }
+
+ newsk = new_mptcp_sock;
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+ } else {
+ MPTCP_INC_STATS(sock_net(ssk),
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+ }
+
+out:
+ newsk->sk_kern_sock = kern;
+ return newsk;
+}
+
+void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct sock *sk = (struct sock *)msk;
+
+ __mptcp_clear_xmit(sk);
+
+ /* join list will be eventually flushed (with rst) at sock lock release time */
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
+
+ /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+ mptcp_data_lock(sk);
+ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ skb_rbtree_purge(&msk->out_of_order_queue);
+ mptcp_data_unlock(sk);
+
+ /* move all the rx fwd alloc into the sk_mem_reclaim_final in
+ * inet_sock_destruct() will dispose it
+ */
+ sk_forward_alloc_add(sk, msk->rmem_fwd_alloc);
+ WRITE_ONCE(msk->rmem_fwd_alloc, 0);
+ mptcp_token_destroy(msk);
+ mptcp_pm_free_anno_list(msk);
+ mptcp_free_local_addr_list(msk);
+}
+
+static void mptcp_destroy(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ /* allow the following to close even the initial subflow */
+ msk->free_first = 1;
+ mptcp_destroy_common(msk, 0);
+ sk_sockets_allocated_dec(sk);
+}
+
+void __mptcp_data_acked(struct sock *sk)
+{
+ if (!sock_owned_by_user(sk))
+ __mptcp_clean_una(sk);
+ else
+ __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
+
+ if (mptcp_pending_data_fin_ack(sk))
+ mptcp_schedule_work(sk);
+}
+
+void __mptcp_check_push(struct sock *sk, struct sock *ssk)
+{
+ if (!mptcp_send_head(sk))
+ return;
+
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk, false);
+ else
+ __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
+}
+
+#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
+ BIT(MPTCP_RETRANSMIT) | \
+ BIT(MPTCP_FLUSH_JOIN_LIST))
+
+/* processes deferred events and flush wmem */
+static void mptcp_release_cb(struct sock *sk)
+ __must_hold(&sk->sk_lock.slock)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ for (;;) {
+ unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
+ msk->push_pending;
+ struct list_head join_list;
+
+ if (!flags)
+ break;
+
+ INIT_LIST_HEAD(&join_list);
+ list_splice_init(&msk->join_list, &join_list);
+
+ /* the following actions acquire the subflow socket lock
+ *
+ * 1) can't be invoked in atomic scope
+ * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
+ * datapath acquires the msk socket spinlock while helding
+ * the subflow socket lock
+ */
+ msk->push_pending = 0;
+ msk->cb_flags &= ~flags;
+ spin_unlock_bh(&sk->sk_lock.slock);
+
+ if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
+ __mptcp_flush_join_list(sk, &join_list);
+ if (flags & BIT(MPTCP_PUSH_PENDING))
+ __mptcp_push_pending(sk, 0);
+ if (flags & BIT(MPTCP_RETRANSMIT))
+ __mptcp_retrans(sk);
+
+ cond_resched();
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
+
+ if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
+ __mptcp_clean_una_wakeup(sk);
+ if (unlikely(msk->cb_flags)) {
+ /* be sure to sync the msk state before taking actions
+ * depending on sk_state (MPTCP_ERROR_REPORT)
+ * On sk release avoid actions depending on the first subflow
+ */
+ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first)
+ __mptcp_sync_state(sk, msk->pending_state);
+ if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
+ __mptcp_error_report(sk);
+ if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
+ __mptcp_sync_sndbuf(sk);
+ }
+
+ __mptcp_update_rmem(sk);
+}
+
+/* MP_JOIN client subflow must wait for 4th ack before sending any data:
+ * TCP can't schedule delack timer before the subflow is fully established.
+ * MPTCP uses the delack timer to do 3rd ack retransmissions
+ */
+static void schedule_3rdack_retransmission(struct sock *ssk)
+{
+ struct inet_connection_sock *icsk = inet_csk(ssk);
+ struct tcp_sock *tp = tcp_sk(ssk);
+ unsigned long timeout;
+
+ if (mptcp_subflow_ctx(ssk)->fully_established)
+ return;
+
+ /* reschedule with a timeout above RTT, as we must look only for drop */
+ if (tp->srtt_us)
+ timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1));
+ else
+ timeout = TCP_TIMEOUT_INIT;
+ timeout += jiffies;
+
+ WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout);
+}
+
+void mptcp_subflow_process_delegated(struct sock *ssk, long status)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ if (status & BIT(MPTCP_DELEGATE_SEND)) {
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk, true);
+ else
+ __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
+ mptcp_data_unlock(sk);
+ }
+ if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_sync_sndbuf(sk);
+ else
+ __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
+ mptcp_data_unlock(sk);
+ }
+ if (status & BIT(MPTCP_DELEGATE_ACK))
+ schedule_3rdack_retransmission(ssk);
+}
+
+static int mptcp_hash(struct sock *sk)
+{
+ /* should never be called,
+ * we hash the TCP subflows not the master socket
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void mptcp_unhash(struct sock *sk)
+{
+ /* called from sk_common_release(), but nothing to do here */
+}
+
+static int mptcp_get_port(struct sock *sk, unsigned short snum)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p, ssk=%p", msk, msk->first);
+ if (WARN_ON_ONCE(!msk->first))
+ return -EINVAL;
+
+ return inet_csk_get_port(msk->first, snum);
+}
+
+void mptcp_finish_connect(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ sk = subflow->conn;
+ msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p, token=%u", sk, subflow->token);
+
+ subflow->map_seq = subflow->iasn;
+ subflow->map_subflow_seq = 1;
+
+ /* the socket is not connected yet, no msk/subflow ops can access/race
+ * accessing the field below
+ */
+ WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+ WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
+
+ mptcp_pm_new_connection(msk, ssk, 0);
+
+ mptcp_rcv_space_init(msk, ssk);
+}
+
+void mptcp_sock_graft(struct sock *sk, struct socket *parent)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ rcu_assign_pointer(sk->sk_wq, &parent->wq);
+ sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+bool mptcp_finish_join(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct sock *parent = (void *)msk;
+ bool ret = true;
+
+ pr_debug("msk=%p, subflow=%p", msk, subflow);
+
+ /* mptcp socket already closing? */
+ if (!mptcp_is_fully_established(parent)) {
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
+ return false;
+ }
+
+ /* active subflow, already present inside the conn_list */
+ if (!list_empty(&subflow->node)) {
+ mptcp_subflow_joined(msk, ssk);
+ mptcp_propagate_sndbuf(parent, ssk);
+ return true;
+ }
+
+ if (!mptcp_pm_allow_new_subflow(msk))
+ goto err_prohibited;
+
+ /* If we can't acquire msk socket lock here, let the release callback
+ * handle it
+ */
+ mptcp_data_lock(parent);
+ if (!sock_owned_by_user(parent)) {
+ ret = __mptcp_finish_join(msk, ssk);
+ if (ret) {
+ sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->conn_list);
+ }
+ } else {
+ sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->join_list);
+ __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
+ }
+ mptcp_data_unlock(parent);
+
+ if (!ret) {
+err_prohibited:
+ subflow->reset_reason = MPTCP_RST_EPROHIBIT;
+ return false;
+ }
+
+ return true;
+}
+
+static void mptcp_shutdown(struct sock *sk, int how)
+{
+ pr_debug("sk=%p, how=%d", sk, how);
+
+ if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
+ __mptcp_wr_shutdown(sk);
+}
+
+static int mptcp_forward_alloc_get(const struct sock *sk)
+{
+ return READ_ONCE(sk->sk_forward_alloc) +
+ READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc);
+}
+
+static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
+{
+ const struct sock *sk = (void *)msk;
+ u64 delta;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ return 0;
+
+ delta = msk->write_seq - v;
+ if (__mptcp_check_fallback(msk) && msk->first) {
+ struct tcp_sock *tp = tcp_sk(msk->first);
+
+ /* the first subflow is disconnected after close - see
+ * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq
+ * so ignore that status, too.
+ */
+ if (!((1 << msk->first->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)))
+ delta += READ_ONCE(tp->write_seq) - tp->snd_una;
+ }
+ if (delta > INT_MAX)
+ delta = INT_MAX;
+
+ return (int)delta;
+}
+
+static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool slow;
+
+ switch (cmd) {
+ case SIOCINQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ lock_sock(sk);
+ __mptcp_move_skbs(msk);
+ *karg = mptcp_inq_hint(sk);
+ release_sock(sk);
+ break;
+ case SIOCOUTQ:
+ slow = lock_sock_fast(sk);
+ *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
+ unlock_sock_fast(sk, slow);
+ break;
+ case SIOCOUTQNSD:
+ slow = lock_sock_fast(sk);
+ *karg = mptcp_ioctl_outq(msk, msk->snd_nxt);
+ unlock_sock_fast(sk, slow);
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ subflow->request_mptcp = 0;
+ __mptcp_do_fallback(msk);
+}
+
+static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int err = -EINVAL;
+ struct sock *ssk;
+
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk))
+ return PTR_ERR(ssk);
+
+ inet_sk_state_store(sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssk);
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info))
+ mptcp_subflow_early_fallback(msk, subflow);
+#endif
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT);
+ mptcp_subflow_early_fallback(msk, subflow);
+ }
+ if (likely(!__mptcp_check_fallback(msk)))
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
+
+ /* if reaching here via the fastopen/sendmsg path, the caller already
+ * acquired the subflow socket lock, too.
+ */
+ if (!msk->fastopening)
+ lock_sock(ssk);
+
+ /* the following mirrors closely a very small chunk of code from
+ * __inet_stream_connect()
+ */
+ if (ssk->sk_state != TCP_CLOSE)
+ goto out;
+
+ if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) {
+ err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len);
+ if (err)
+ goto out;
+ }
+
+ err = ssk->sk_prot->connect(ssk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk));
+
+out:
+ if (!msk->fastopening)
+ release_sock(ssk);
+
+ /* on successful connect, the msk state will be moved to established by
+ * subflow_finish_connect()
+ */
+ if (unlikely(err)) {
+ /* avoid leaving a dangling token in an unconnected socket */
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sk, TCP_CLOSE);
+ return err;
+ }
+
+ mptcp_copy_inaddrs(sk, ssk);
+ return 0;
+}
+
+static struct proto mptcp_prot = {
+ .name = "MPTCP",
+ .owner = THIS_MODULE,
+ .init = mptcp_init_sock,
+ .connect = mptcp_connect,
+ .disconnect = mptcp_disconnect,
+ .close = mptcp_close,
+ .accept = mptcp_accept,
+ .setsockopt = mptcp_setsockopt,
+ .getsockopt = mptcp_getsockopt,
+ .shutdown = mptcp_shutdown,
+ .destroy = mptcp_destroy,
+ .sendmsg = mptcp_sendmsg,
+ .ioctl = mptcp_ioctl,
+ .recvmsg = mptcp_recvmsg,
+ .release_cb = mptcp_release_cb,
+ .hash = mptcp_hash,
+ .unhash = mptcp_unhash,
+ .get_port = mptcp_get_port,
+ .forward_alloc_get = mptcp_forward_alloc_get,
+ .sockets_allocated = &mptcp_sockets_allocated,
+
+ .memory_allocated = &tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
+ .memory_pressure = &tcp_memory_pressure,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
+ .sysctl_mem = sysctl_tcp_mem,
+ .obj_size = sizeof(struct mptcp_sock),
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
+ .no_autobind = true,
+};
+
+static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct sock *ssk, *sk = sock->sk;
+ int err = -EINVAL;
+
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ err = PTR_ERR(ssk);
+ goto unlock;
+ }
+
+ if (sk->sk_family == AF_INET)
+ err = inet_bind_sk(ssk, uaddr, addr_len);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (sk->sk_family == AF_INET6)
+ err = inet6_bind_sk(ssk, uaddr, addr_len);
+#endif
+ if (!err)
+ mptcp_copy_inaddrs(sk, ssk);
+
+unlock:
+ release_sock(sk);
+ return err;
+}
+
+static int mptcp_listen(struct socket *sock, int backlog)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct sock *sk = sock->sk;
+ struct sock *ssk;
+ int err;
+
+ pr_debug("msk=%p", msk);
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto unlock;
+
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ err = PTR_ERR(ssk);
+ goto unlock;
+ }
+
+ inet_sk_state_store(sk, TCP_LISTEN);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
+ lock_sock(ssk);
+ err = __inet_listen_sk(ssk, backlog);
+ release_sock(ssk);
+ inet_sk_state_store(sk, inet_sk_state_load(ssk));
+
+ if (!err) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ mptcp_copy_inaddrs(sk, ssk);
+ mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
+ }
+
+unlock:
+ release_sock(sk);
+ return err;
+}
+
+static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
+ int flags, bool kern)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct sock *ssk, *newsk;
+ int err;
+
+ pr_debug("msk=%p", msk);
+
+ /* Buggy applications can call accept on socket states other then LISTEN
+ * but no need to allocate the first subflow just to error out.
+ */
+ ssk = READ_ONCE(msk->first);
+ if (!ssk)
+ return -EINVAL;
+
+ newsk = mptcp_accept(ssk, flags, &err, kern);
+ if (!newsk)
+ return err;
+
+ lock_sock(newsk);
+
+ __inet_accept(sock, newsock, newsk);
+ if (!mptcp_is_tcpsk(newsock->sk)) {
+ struct mptcp_sock *msk = mptcp_sk(newsk);
+ struct mptcp_subflow_context *subflow;
+
+ set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+ msk->in_accept_queue = 0;
+
+ /* set ssk->sk_socket of accept()ed flows to mptcp socket.
+ * This is needed so NOSPACE flag can be set from tcp stack.
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!ssk->sk_socket)
+ mptcp_sock_graft(ssk, newsock);
+ }
+
+ /* Do late cleanup for the first subflow as necessary. Also
+ * deal with bad peers not doing a complete shutdown.
+ */
+ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) {
+ __mptcp_close_ssk(newsk, msk->first,
+ mptcp_subflow_ctx(msk->first), 0);
+ if (unlikely(list_is_singular(&msk->conn_list)))
+ inet_sk_state_store(newsk, TCP_CLOSE);
+ }
+ }
+ release_sock(newsk);
+
+ return 0;
+}
+
+static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ if (sk_stream_is_writeable(sk))
+ return EPOLLOUT | EPOLLWRNORM;
+
+ mptcp_set_nospace(sk);
+ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+ if (sk_stream_is_writeable(sk))
+ return EPOLLOUT | EPOLLWRNORM;
+
+ return 0;
+}
+
+static __poll_t mptcp_poll(struct file *file, struct socket *sock,
+ struct poll_table_struct *wait)
+{
+ struct sock *sk = sock->sk;
+ struct mptcp_sock *msk;
+ __poll_t mask = 0;
+ u8 shutdown;
+ int state;
+
+ msk = mptcp_sk(sk);
+ sock_poll_wait(file, sock, wait);
+
+ state = inet_sk_state_load(sk);
+ pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
+ if (state == TCP_LISTEN) {
+ struct sock *ssk = READ_ONCE(msk->first);
+
+ if (WARN_ON_ONCE(!ssk))
+ return 0;
+
+ return inet_csk_listen_poll(ssk);
+ }
+
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ mask |= EPOLLHUP;
+ if (shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+
+ if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
+ mask |= mptcp_check_readable(msk);
+ if (shutdown & SEND_SHUTDOWN)
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ else
+ mask |= mptcp_check_writeable(msk);
+ } else if (state == TCP_SYN_SENT &&
+ inet_test_bit(DEFER_CONNECT, sk)) {
+ /* cf tcp_poll() note about TFO */
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ }
+
+ /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
+ smp_rmb();
+ if (READ_ONCE(sk->sk_err))
+ mask |= EPOLLERR;
+
+ return mask;
+}
+
+static const struct proto_ops mptcp_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = mptcp_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = mptcp_stream_accept,
+ .getname = inet_getname,
+ .poll = mptcp_poll,
+ .ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = mptcp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static struct inet_protosw mptcp_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_MPTCP,
+ .prot = &mptcp_prot,
+ .ops = &mptcp_stream_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+static int mptcp_napi_poll(struct napi_struct *napi, int budget)
+{
+ struct mptcp_delegated_action *delegated;
+ struct mptcp_subflow_context *subflow;
+ int work_done = 0;
+
+ delegated = container_of(napi, struct mptcp_delegated_action, napi);
+ while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ bh_lock_sock_nested(ssk);
+ if (!sock_owned_by_user(ssk)) {
+ mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0));
+ } else {
+ /* tcp_release_cb_override already processed
+ * the action or will do at next release_sock().
+ * In both case must dequeue the subflow here - on the same
+ * CPU that scheduled it.
+ */
+ smp_wmb();
+ clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status);
+ }
+ bh_unlock_sock(ssk);
+ sock_put(ssk);
+
+ if (++work_done == budget)
+ return budget;
+ }
+
+ /* always provide a 0 'work_done' argument, so that napi_complete_done
+ * will not try accessing the NULL napi->dev ptr
+ */
+ napi_complete_done(napi, 0);
+ return work_done;
+}
+
+void __init mptcp_proto_init(void)
+{
+ struct mptcp_delegated_action *delegated;
+ int cpu;
+
+ mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
+
+ if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
+ panic("Failed to allocate MPTCP pcpu counter\n");
+
+ init_dummy_netdev(&mptcp_napi_dev);
+ for_each_possible_cpu(cpu) {
+ delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
+ INIT_LIST_HEAD(&delegated->head);
+ netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+ mptcp_napi_poll);
+ napi_enable(&delegated->napi);
+ }
+
+ mptcp_subflow_init();
+ mptcp_pm_init();
+ mptcp_sched_init();
+ mptcp_token_init();
+
+ if (proto_register(&mptcp_prot, 1) != 0)
+ panic("Failed to register MPTCP proto.\n");
+
+ inet_register_protosw(&mptcp_protosw);
+
+ BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static const struct proto_ops mptcp_v6_stream_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = mptcp_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = mptcp_stream_accept,
+ .getname = inet6_getname,
+ .poll = mptcp_poll,
+ .ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = mptcp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet6_sendmsg,
+ .recvmsg = inet6_recvmsg,
+ .mmap = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = inet6_compat_ioctl,
+#endif
+};
+
+static struct proto mptcp_v6_prot;
+
+static struct inet_protosw mptcp_v6_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_MPTCP,
+ .prot = &mptcp_v6_prot,
+ .ops = &mptcp_v6_stream_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+int __init mptcp_proto_v6_init(void)
+{
+ int err;
+
+ mptcp_v6_prot = mptcp_prot;
+ strcpy(mptcp_v6_prot.name, "MPTCPv6");
+ mptcp_v6_prot.slab = NULL;
+ mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
+ mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np);
+
+ err = proto_register(&mptcp_v6_prot, 1);
+ if (err)
+ return err;
+
+ err = inet6_register_protosw(&mptcp_v6_protosw);
+ if (err)
+ proto_unregister(&mptcp_v6_prot);
+
+ return err;
+}
+#endif
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
new file mode 100644
index 000000000..07c5ac37d
--- /dev/null
+++ b/net/mptcp/protocol.h
@@ -0,0 +1,1132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#ifndef __MPTCP_PROTOCOL_H
+#define __MPTCP_PROTOCOL_H
+
+#include <linux/random.h>
+#include <net/tcp.h>
+#include <net/inet_connection_sock.h>
+#include <uapi/linux/mptcp.h>
+#include <net/genetlink.h>
+
+#define MPTCP_SUPPORTED_VERSION 1
+
+/* MPTCP option bits */
+#define OPTION_MPTCP_MPC_SYN BIT(0)
+#define OPTION_MPTCP_MPC_SYNACK BIT(1)
+#define OPTION_MPTCP_MPC_ACK BIT(2)
+#define OPTION_MPTCP_MPJ_SYN BIT(3)
+#define OPTION_MPTCP_MPJ_SYNACK BIT(4)
+#define OPTION_MPTCP_MPJ_ACK BIT(5)
+#define OPTION_MPTCP_ADD_ADDR BIT(6)
+#define OPTION_MPTCP_RM_ADDR BIT(7)
+#define OPTION_MPTCP_FASTCLOSE BIT(8)
+#define OPTION_MPTCP_PRIO BIT(9)
+#define OPTION_MPTCP_RST BIT(10)
+#define OPTION_MPTCP_DSS BIT(11)
+#define OPTION_MPTCP_FAIL BIT(12)
+
+#define OPTION_MPTCP_CSUMREQD BIT(13)
+
+#define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \
+ OPTION_MPTCP_MPC_ACK)
+#define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \
+ OPTION_MPTCP_MPJ_ACK)
+
+/* MPTCP option subtypes */
+#define MPTCPOPT_MP_CAPABLE 0
+#define MPTCPOPT_MP_JOIN 1
+#define MPTCPOPT_DSS 2
+#define MPTCPOPT_ADD_ADDR 3
+#define MPTCPOPT_RM_ADDR 4
+#define MPTCPOPT_MP_PRIO 5
+#define MPTCPOPT_MP_FAIL 6
+#define MPTCPOPT_MP_FASTCLOSE 7
+#define MPTCPOPT_RST 8
+
+/* MPTCP suboption lengths */
+#define TCPOLEN_MPTCP_MPC_SYN 4
+#define TCPOLEN_MPTCP_MPC_SYNACK 12
+#define TCPOLEN_MPTCP_MPC_ACK 20
+#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
+#define TCPOLEN_MPTCP_MPJ_SYN 12
+#define TCPOLEN_MPTCP_MPJ_SYNACK 16
+#define TCPOLEN_MPTCP_MPJ_ACK 24
+#define TCPOLEN_MPTCP_DSS_BASE 4
+#define TCPOLEN_MPTCP_DSS_ACK32 4
+#define TCPOLEN_MPTCP_DSS_ACK64 8
+#define TCPOLEN_MPTCP_DSS_MAP32 10
+#define TCPOLEN_MPTCP_DSS_MAP64 14
+#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
+#define TCPOLEN_MPTCP_ADD_ADDR 16
+#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10
+#define TCPOLEN_MPTCP_ADD_ADDR6 28
+#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
+#define TCPOLEN_MPTCP_PORT_LEN 2
+#define TCPOLEN_MPTCP_PORT_ALIGN 2
+#define TCPOLEN_MPTCP_RM_ADDR_BASE 3
+#define TCPOLEN_MPTCP_PRIO 3
+#define TCPOLEN_MPTCP_PRIO_ALIGN 4
+#define TCPOLEN_MPTCP_FASTCLOSE 12
+#define TCPOLEN_MPTCP_RST 4
+#define TCPOLEN_MPTCP_FAIL 12
+
+#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA)
+
+/* MPTCP MP_JOIN flags */
+#define MPTCPOPT_BACKUP BIT(0)
+#define MPTCPOPT_THMAC_LEN 8
+
+/* MPTCP MP_CAPABLE flags */
+#define MPTCP_VERSION_MASK (0x0F)
+#define MPTCP_CAP_CHECKSUM_REQD BIT(7)
+#define MPTCP_CAP_EXTENSIBILITY BIT(6)
+#define MPTCP_CAP_DENY_JOIN_ID0 BIT(5)
+#define MPTCP_CAP_HMAC_SHA256 BIT(0)
+#define MPTCP_CAP_FLAG_MASK (0x1F)
+
+/* MPTCP DSS flags */
+#define MPTCP_DSS_DATA_FIN BIT(4)
+#define MPTCP_DSS_DSN64 BIT(3)
+#define MPTCP_DSS_HAS_MAP BIT(2)
+#define MPTCP_DSS_ACK64 BIT(1)
+#define MPTCP_DSS_HAS_ACK BIT(0)
+#define MPTCP_DSS_FLAG_MASK (0x1F)
+
+/* MPTCP ADD_ADDR flags */
+#define MPTCP_ADDR_ECHO BIT(0)
+
+/* MPTCP MP_PRIO flags */
+#define MPTCP_PRIO_BKUP BIT(0)
+
+/* MPTCP TCPRST flags */
+#define MPTCP_RST_TRANSIENT BIT(0)
+
+/* MPTCP socket atomic flags */
+#define MPTCP_NOSPACE 1
+#define MPTCP_WORK_RTX 2
+#define MPTCP_FALLBACK_DONE 4
+#define MPTCP_WORK_CLOSE_SUBFLOW 5
+
+/* MPTCP socket release cb flags */
+#define MPTCP_PUSH_PENDING 1
+#define MPTCP_CLEAN_UNA 2
+#define MPTCP_ERROR_REPORT 3
+#define MPTCP_RETRANSMIT 4
+#define MPTCP_FLUSH_JOIN_LIST 5
+#define MPTCP_SYNC_STATE 6
+#define MPTCP_SYNC_SNDBUF 7
+
+struct mptcp_skb_cb {
+ u64 map_seq;
+ u64 end_seq;
+ u32 offset;
+ u8 has_rxtstamp:1;
+};
+
+#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+
+static inline bool before64(__u64 seq1, __u64 seq2)
+{
+ return (__s64)(seq1 - seq2) < 0;
+}
+
+#define after64(seq2, seq1) before64(seq1, seq2)
+
+struct mptcp_options_received {
+ u64 sndr_key;
+ u64 rcvr_key;
+ u64 data_ack;
+ u64 data_seq;
+ u32 subflow_seq;
+ u16 data_len;
+ __sum16 csum;
+ u16 suboptions;
+ u32 token;
+ u32 nonce;
+ u16 use_map:1,
+ dsn64:1,
+ data_fin:1,
+ use_ack:1,
+ ack64:1,
+ mpc_map:1,
+ reset_reason:4,
+ reset_transient:1,
+ echo:1,
+ backup:1,
+ deny_join_id0:1,
+ __unused:2;
+ u8 join_id;
+ u64 thmac;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_addr_info addr;
+ struct mptcp_rm_list rm_list;
+ u64 ahmac;
+ u64 fail_seq;
+};
+
+static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
+{
+ return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
+ ((nib & 0xF) << 8) | field);
+}
+
+enum mptcp_pm_status {
+ MPTCP_PM_ADD_ADDR_RECEIVED,
+ MPTCP_PM_ADD_ADDR_SEND_ACK,
+ MPTCP_PM_RM_ADDR_RECEIVED,
+ MPTCP_PM_ESTABLISHED,
+ MPTCP_PM_SUBFLOW_ESTABLISHED,
+ MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */
+ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is
+ * accounted int id_avail_bitmap
+ */
+};
+
+enum mptcp_pm_type {
+ MPTCP_PM_TYPE_KERNEL = 0,
+ MPTCP_PM_TYPE_USERSPACE,
+
+ __MPTCP_PM_TYPE_NR,
+ __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1,
+};
+
+/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
+#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)
+
+enum mptcp_addr_signal_status {
+ MPTCP_ADD_ADDR_SIGNAL,
+ MPTCP_ADD_ADDR_ECHO,
+ MPTCP_RM_ADDR_SIGNAL,
+};
+
+/* max value of mptcp_addr_info.id */
+#define MPTCP_PM_MAX_ADDR_ID U8_MAX
+
+struct mptcp_pm_data {
+ struct mptcp_addr_info local;
+ struct mptcp_addr_info remote;
+ struct list_head anno_list;
+ struct list_head userspace_pm_local_addr_list;
+
+ spinlock_t lock; /*protects the whole PM data */
+
+ u8 addr_signal;
+ bool server_side;
+ bool work_pending;
+ bool accept_addr;
+ bool accept_subflow;
+ bool remote_deny_join_id0;
+ u8 add_addr_signaled;
+ u8 add_addr_accepted;
+ u8 local_addr_used;
+ u8 pm_type;
+ u8 subflows;
+ u8 status;
+ DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct mptcp_rm_list rm_list_tx;
+ struct mptcp_rm_list rm_list_rx;
+};
+
+struct mptcp_pm_addr_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ u8 flags;
+ int ifindex;
+ struct socket *lsk;
+};
+
+struct mptcp_data_frag {
+ struct list_head list;
+ u64 data_seq;
+ u16 data_len;
+ u16 offset;
+ u16 overhead;
+ u16 already_sent;
+ struct page *page;
+};
+
+/* MPTCP connection sock */
+struct mptcp_sock {
+ /* inet_connection_sock must be the first member */
+ struct inet_connection_sock sk;
+ u64 local_key;
+ u64 remote_key;
+ u64 write_seq;
+ u64 bytes_sent;
+ u64 snd_nxt;
+ u64 bytes_received;
+ u64 ack_seq;
+ atomic64_t rcv_wnd_sent;
+ u64 rcv_data_fin_seq;
+ u64 bytes_retrans;
+ int rmem_fwd_alloc;
+ int snd_burst;
+ int old_wspace;
+ u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
+ * recovery related fields are under data_lock
+ * protection
+ */
+ u64 bytes_acked;
+ u64 snd_una;
+ u64 wnd_end;
+ unsigned long timer_ival;
+ u32 token;
+ int rmem_released;
+ unsigned long flags;
+ unsigned long cb_flags;
+ unsigned long push_pending;
+ bool recovery; /* closing subflow write queue reinjected */
+ bool can_ack;
+ bool fully_established;
+ bool rcv_data_fin;
+ bool snd_data_fin_enable;
+ bool rcv_fastclose;
+ bool use_64bit_ack; /* Set when we received a 64-bit DSN */
+ bool csum_enabled;
+ bool allow_infinite_fallback;
+ u8 pending_state; /* A subflow asked to set this sk_state,
+ * protected by the msk data lock
+ */
+ u8 mpc_endpoint_id;
+ u8 recvmsg_inq:1,
+ cork:1,
+ nodelay:1,
+ fastopening:1,
+ in_accept_queue:1,
+ free_first:1;
+ struct work_struct work;
+ struct sk_buff *ooo_last_skb;
+ struct rb_root out_of_order_queue;
+ struct sk_buff_head receive_queue;
+ struct list_head conn_list;
+ struct list_head rtx_queue;
+ struct mptcp_data_frag *first_pending;
+ struct list_head join_list;
+ struct sock *first; /* The mptcp ops can safely dereference, using suitable
+ * ONCE annotation, the subflow outside the socket
+ * lock as such sock is freed after close().
+ */
+ struct mptcp_pm_data pm;
+ struct mptcp_sched_ops *sched;
+ struct {
+ u32 space; /* bytes copied in last measurement window */
+ u32 copied; /* bytes copied in this measurement window */
+ u64 time; /* start time of measurement window */
+ u64 rtt_us; /* last maximum rtt of subflows */
+ } rcvq_space;
+ u8 scaling_ratio;
+
+ u32 subflow_id;
+ u32 setsockopt_seq;
+ char ca_name[TCP_CA_NAME_MAX];
+};
+
+#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
+#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
+
+#define mptcp_for_each_subflow(__msk, __subflow) \
+ list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \
+ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
+
+static inline void msk_owned_by_me(const struct mptcp_sock *msk)
+{
+ sock_owned_by_me((const struct sock *)msk);
+}
+
+#define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk)
+
+/* the msk socket don't use the backlog, also account for the bulk
+ * free memory
+ */
+static inline int __mptcp_rmem(const struct sock *sk)
+{
+ return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
+}
+
+static inline int mptcp_win_from_space(const struct sock *sk, int space)
+{
+ return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
+}
+
+static inline int __mptcp_space(const struct sock *sk)
+{
+ return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
+}
+
+static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+
+ return READ_ONCE(msk->first_pending);
+}
+
+static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *cur;
+
+ cur = msk->first_pending;
+ return list_is_last(&cur->list, &msk->rtx_queue) ? NULL :
+ list_next_entry(cur, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!msk->first_pending)
+ return NULL;
+
+ if (WARN_ON_ONCE(list_empty(&msk->rtx_queue)))
+ return NULL;
+
+ return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (msk->snd_una == READ_ONCE(msk->snd_nxt))
+ return NULL;
+
+ return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
+struct csum_pseudo_header {
+ __be64 data_seq;
+ __be32 subflow_seq;
+ __be16 data_len;
+ __sum16 csum;
+};
+
+struct mptcp_subflow_request_sock {
+ struct tcp_request_sock sk;
+ u16 mp_capable : 1,
+ mp_join : 1,
+ backup : 1,
+ csum_reqd : 1,
+ allow_join_id0 : 1;
+ u8 local_id;
+ u8 remote_id;
+ u64 local_key;
+ u64 idsn;
+ u32 token;
+ u32 ssn_offset;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_nonce;
+ struct mptcp_sock *msk;
+ struct hlist_nulls_node token_node;
+};
+
+static inline struct mptcp_subflow_request_sock *
+mptcp_subflow_rsk(const struct request_sock *rsk)
+{
+ return (struct mptcp_subflow_request_sock *)rsk;
+}
+
+enum mptcp_data_avail {
+ MPTCP_SUBFLOW_NODATA,
+ MPTCP_SUBFLOW_DATA_AVAIL,
+};
+
+struct mptcp_delegated_action {
+ struct napi_struct napi;
+ struct list_head head;
+};
+
+DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+
+#define MPTCP_DELEGATE_SCHEDULED 0
+#define MPTCP_DELEGATE_SEND 1
+#define MPTCP_DELEGATE_ACK 2
+#define MPTCP_DELEGATE_SNDBUF 3
+
+#define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED))
+/* MPTCP subflow context */
+struct mptcp_subflow_context {
+ struct list_head node;/* conn_list of subflows */
+
+ struct_group(reset,
+
+ unsigned long avg_pacing_rate; /* protected by msk socket lock */
+ u64 local_key;
+ u64 remote_key;
+ u64 idsn;
+ u64 map_seq;
+ u32 snd_isn;
+ u32 token;
+ u32 rel_write_seq;
+ u32 map_subflow_seq;
+ u32 ssn_offset;
+ u32 map_data_len;
+ __wsum map_data_csum;
+ u32 map_csum_len;
+ u32 request_mptcp : 1, /* send MP_CAPABLE */
+ request_join : 1, /* send MP_JOIN */
+ request_bkup : 1,
+ mp_capable : 1, /* remote is MPTCP capable */
+ mp_join : 1, /* remote is JOINing */
+ fully_established : 1, /* path validated */
+ pm_notified : 1, /* PM hook called for established status */
+ conn_finished : 1,
+ map_valid : 1,
+ map_csum_reqd : 1,
+ map_data_fin : 1,
+ mpc_map : 1,
+ backup : 1,
+ send_mp_prio : 1,
+ send_mp_fail : 1,
+ send_fastclose : 1,
+ send_infinite_map : 1,
+ remote_key_valid : 1, /* received the peer key from */
+ disposable : 1, /* ctx can be free at ulp release time */
+ stale : 1, /* unable to snd/rcv data, do not use for xmit */
+ local_id_valid : 1, /* local_id is correctly initialized */
+ valid_csum_seen : 1, /* at least one csum validated */
+ is_mptfo : 1, /* subflow is doing TFO */
+ __unused : 9;
+ enum mptcp_data_avail data_avail;
+ bool scheduled;
+ u32 remote_nonce;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_token;
+ union {
+ u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */
+ u64 iasn; /* initial ack sequence number, MPC subflows only */
+ };
+ u8 local_id;
+ u8 remote_id;
+ u8 reset_seen:1;
+ u8 reset_transient:1;
+ u8 reset_reason:4;
+ u8 stale_count;
+
+ u32 subflow_id;
+
+ long delegated_status;
+ unsigned long fail_tout;
+
+ );
+
+ struct list_head delegated_node; /* link into delegated_action, protected by local BH */
+
+ u32 setsockopt_seq;
+ u32 stale_rcv_tstamp;
+ int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf,
+ * protected by the msk socket lock
+ */
+
+ struct sock *tcp_sock; /* tcp sk backpointer */
+ struct sock *conn; /* parent mptcp_sock */
+ const struct inet_connection_sock_af_ops *icsk_af_ops;
+ void (*tcp_state_change)(struct sock *sk);
+ void (*tcp_error_report)(struct sock *sk);
+
+ struct rcu_head rcu;
+};
+
+static inline struct mptcp_subflow_context *
+mptcp_subflow_ctx(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* Use RCU on icsk_ulp_data only for sock diag code */
+ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
+}
+
+static inline struct sock *
+mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
+{
+ return subflow->tcp_sock;
+}
+
+static inline void
+mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
+{
+ memset(&subflow->reset, 0, sizeof(subflow->reset));
+ subflow->request_mptcp = 1;
+}
+
+static inline u64
+mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
+{
+ return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq -
+ subflow->ssn_offset -
+ subflow->map_subflow_seq;
+}
+
+static inline u64
+mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
+{
+ return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
+}
+
+void mptcp_subflow_process_delegated(struct sock *ssk, long actions);
+
+static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action)
+{
+ long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action);
+ struct mptcp_delegated_action *delegated;
+ bool schedule;
+
+ /* the caller held the subflow bh socket lock */
+ lockdep_assert_in_softirq();
+
+ /* The implied barrier pairs with tcp_release_cb_override()
+ * mptcp_napi_poll(), and ensures the below list check sees list
+ * updates done prior to delegated status bits changes
+ */
+ old = set_mask_bits(&subflow->delegated_status, 0, set_bits);
+ if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) {
+ if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node)))
+ return;
+
+ delegated = this_cpu_ptr(&mptcp_delegated_actions);
+ schedule = list_empty(&delegated->head);
+ list_add_tail(&subflow->delegated_node, &delegated->head);
+ sock_hold(mptcp_subflow_tcp_sock(subflow));
+ if (schedule)
+ napi_schedule(&delegated->napi);
+ }
+}
+
+static inline struct mptcp_subflow_context *
+mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
+{
+ struct mptcp_subflow_context *ret;
+
+ if (list_empty(&delegated->head))
+ return NULL;
+
+ ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
+ list_del_init(&ret->delegated_node);
+ return ret;
+}
+
+int mptcp_is_enabled(const struct net *net);
+unsigned int mptcp_get_add_addr_timeout(const struct net *net);
+int mptcp_is_checksum_enabled(const struct net *net);
+int mptcp_allow_join_id0(const struct net *net);
+unsigned int mptcp_stale_loss_cnt(const struct net *net);
+int mptcp_get_pm_type(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
+void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt);
+bool __mptcp_retransmit_pending_data(struct sock *sk);
+void mptcp_check_and_set_pending(struct sock *sk);
+void __mptcp_push_pending(struct sock *sk, unsigned int flags);
+bool mptcp_subflow_data_available(struct sock *sk);
+void __init mptcp_subflow_init(void);
+void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
+void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow);
+void __mptcp_subflow_send_ack(struct sock *ssk);
+void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
+void mptcp_sock_graft(struct sock *sk, struct socket *parent);
+struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk);
+bool __mptcp_close(struct sock *sk, long timeout);
+void mptcp_cancel_work(struct sock *sk);
+void __mptcp_unaccepted_force_close(struct sock *sk);
+void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk);
+
+bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
+ const struct mptcp_addr_info *b, bool use_port);
+void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr);
+
+/* called with sk socket lock held */
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote);
+int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
+ struct socket **new_sock);
+void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr,
+ unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled);
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
+
+static inline bool __tcp_can_send(const struct sock *ssk)
+{
+ /* only send if our side has not closed yet */
+ return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ return __tcp_can_send(mptcp_subflow_tcp_sock(subflow));
+}
+
+void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);
+
+bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);
+
+void mptcp_subflow_drop_ctx(struct sock *ssk);
+
+static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *ctx)
+{
+ sk->sk_data_ready = sock_def_readable;
+ sk->sk_state_change = ctx->tcp_state_change;
+ sk->sk_write_space = sk_stream_write_space;
+ sk->sk_error_report = ctx->tcp_error_report;
+
+ inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
+}
+
+void __init mptcp_proto_init(void);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+int __init mptcp_proto_v6_init(void);
+#endif
+
+struct sock *mptcp_sk_clone_init(const struct sock *sk,
+ const struct mptcp_options_received *mp_opt,
+ struct sock *ssk,
+ struct request_sock *req);
+void mptcp_get_options(const struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt);
+
+void mptcp_finish_connect(struct sock *sk);
+void __mptcp_sync_state(struct sock *sk, int state);
+void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);
+
+static inline void mptcp_stop_tout_timer(struct sock *sk)
+{
+ if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
+ return;
+
+ sk_stop_timer(sk, &sk->sk_timer);
+ inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
+}
+
+static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
+{
+ /* avoid 0 timestamp, as that means no close timeout */
+ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
+}
+
+static inline void mptcp_start_tout_timer(struct sock *sk)
+{
+ mptcp_set_close_tout(sk, tcp_jiffies32);
+ mptcp_reset_tout_timer(mptcp_sk(sk), 0);
+}
+
+static inline bool mptcp_is_fully_established(struct sock *sk)
+{
+ return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
+ READ_ONCE(mptcp_sk(sk)->fully_established);
+}
+void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
+void mptcp_data_ready(struct sock *sk, struct sock *ssk);
+bool mptcp_finish_join(struct sock *sk);
+bool mptcp_schedule_work(struct sock *sk);
+int mptcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen);
+int mptcp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *option);
+
+u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq);
+static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit)
+{
+ if (use_64bit)
+ return cur_seq;
+
+ return __mptcp_expand_seq(old_seq, cur_seq);
+}
+void __mptcp_check_push(struct sock *sk, struct sock *ssk);
+void __mptcp_data_acked(struct sock *sk);
+void __mptcp_error_report(struct sock *sk);
+bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
+static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->snd_data_fin_enable) &&
+ READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
+}
+
+static inline void __mptcp_sync_sndbuf(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ int ssk_sndbuf, new_sndbuf;
+
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return;
+
+ new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0];
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
+
+ subflow->cached_sndbuf = ssk_sndbuf;
+ new_sndbuf += ssk_sndbuf;
+ }
+
+ /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
+ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
+}
+
+/* The called held both the msk socket and the subflow socket locks,
+ * possibly under BH
+ */
+static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)
+ __mptcp_sync_sndbuf(sk);
+}
+
+/* the caller held only the subflow socket lock, either in process or
+ * BH context. Additionally this can be called under the msk data lock,
+ * so we can't acquire such lock here: let the delegate action acquires
+ * the needed locks in suitable order.
+ */
+static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
+ return;
+
+ local_bh_disable();
+ mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
+ local_bh_enable();
+}
+
+static inline void mptcp_write_space(struct sock *sk)
+{
+ if (sk_stream_is_writeable(sk)) {
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
+ sk_stream_write_space(sk);
+ }
+}
+
+void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags);
+
+#define MPTCP_TOKEN_MAX_RETRIES 4
+
+void __init mptcp_token_init(void);
+static inline void mptcp_token_init_request(struct request_sock *req)
+{
+ mptcp_subflow_rsk(req)->token_node.pprev = NULL;
+}
+
+int mptcp_token_new_request(struct request_sock *req);
+void mptcp_token_destroy_request(struct request_sock *req);
+int mptcp_token_new_connect(struct sock *ssk);
+void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
+ struct mptcp_sock *msk);
+bool mptcp_token_exists(u32 token);
+struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token);
+struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
+ long *s_num);
+void mptcp_token_destroy(struct mptcp_sock *msk);
+
+void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
+
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
+__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum);
+
+void __init mptcp_pm_init(void);
+void mptcp_pm_data_init(struct mptcp_sock *msk);
+void mptcp_pm_data_reset(struct mptcp_sock *msk);
+int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ struct mptcp_addr_info *addr);
+int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry);
+bool mptcp_pm_addr_families_match(const struct sock *sk,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *rem);
+void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
+void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
+void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
+void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk);
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
+void mptcp_pm_connection_closed(struct mptcp_sock *msk);
+void mptcp_pm_subflow_established(struct mptcp_sock *msk);
+bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
+void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
+ const struct mptcp_subflow_context *subflow);
+void mptcp_pm_add_addr_received(const struct sock *ssk,
+ const struct mptcp_addr_info *addr);
+void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
+void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk);
+void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list);
+void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
+void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq);
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup);
+bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
+struct mptcp_pm_add_entry *
+mptcp_pm_del_add_timer(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr, bool check_id);
+struct mptcp_pm_add_entry *
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
+ unsigned int id,
+ u8 *flags, int *ifindex);
+int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
+ u8 *flags, int *ifindex);
+int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
+ unsigned int id,
+ u8 *flags, int *ifindex);
+int mptcp_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup);
+int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup);
+int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup);
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr,
+ bool echo);
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
+void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list);
+void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list);
+
+void mptcp_free_local_addr_list(struct mptcp_sock *msk);
+int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info);
+int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info);
+int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info);
+int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info);
+
+void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
+ const struct sock *ssk, gfp_t gfp);
+void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info);
+void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
+void mptcp_event_pm_listener(const struct sock *ssk,
+ enum mptcp_event_type event);
+bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);
+
+void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt);
+void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
+ struct request_sock *req);
+
+static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) &
+ (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO));
+}
+
+static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL);
+}
+
+static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO);
+}
+
+static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL);
+}
+
+static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE;
+}
+
+static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL;
+}
+
+static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
+{
+ u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
+
+ if (family == AF_INET6)
+ len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
+ if (!echo)
+ len += MPTCPOPT_THMAC_LEN;
+ /* account for 2 trailing 'nop' options */
+ if (port)
+ len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN;
+
+ return len;
+}
+
+static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
+{
+ if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX)
+ return -EINVAL;
+
+ return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
+}
+
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
+ unsigned int opt_size, unsigned int remaining,
+ struct mptcp_addr_info *addr, bool *echo,
+ bool *drop_other_suboptions);
+bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_rm_list *rm_list);
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+
+void __init mptcp_pm_nl_init(void);
+void mptcp_pm_nl_work(struct mptcp_sock *msk);
+void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
+ const struct mptcp_rm_list *rm_list);
+unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
+
+/* called under PM lock */
+static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
+{
+ if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk))
+ WRITE_ONCE(msk->pm.accept_subflow, true);
+}
+
+static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
+{
+ spin_lock_bh(&msk->pm.lock);
+ __mptcp_pm_close_subflow(msk);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
+void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
+
+static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
+{
+ return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
+}
+
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
+
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
+{
+ return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline bool mptcp_check_fallback(const struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ return __mptcp_check_fallback(msk);
+}
+
+static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
+{
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) {
+ pr_debug("TCP fallback already done (msk=%p)", msk);
+ return;
+ }
+ set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
+}
+
+static inline void mptcp_do_fallback(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+ struct mptcp_sock *msk;
+
+ msk = mptcp_sk(sk);
+ __mptcp_do_fallback(msk);
+ if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
+ gfp_t saved_allocation = ssk->sk_allocation;
+
+ /* we are in a atomic (BH) scope, override ssk default for data
+ * fin allocation
+ */
+ ssk->sk_allocation = GFP_ATOMIC;
+ ssk->sk_shutdown |= SEND_SHUTDOWN;
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ ssk->sk_allocation = saved_allocation;
+ }
+}
+
+#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
+
+static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
+{
+ struct mptcp_ext *mpext;
+
+ mpext = skb ? mptcp_get_ext(skb) : NULL;
+ if (mpext && mpext->infinite_map)
+ return true;
+
+ return false;
+}
+
+static inline bool is_active_ssk(struct mptcp_subflow_context *subflow)
+{
+ return (subflow->request_mptcp || subflow->request_join);
+}
+
+static inline bool subflow_simultaneous_connect(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1) &&
+ is_active_ssk(subflow) &&
+ !subflow->conn_finished;
+}
+
+#ifdef CONFIG_SYN_COOKIES
+void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb);
+bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb);
+void __init mptcp_join_cookie_init(void);
+#else
+static inline void
+subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb) {}
+static inline bool
+mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline void mptcp_join_cookie_init(void) {}
+#endif
+
+#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000..4ab0693c0
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include "protocol.h"
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
+ struct mptcp_sched_data *data)
+{
+ struct sock *ssk;
+
+ ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
+ mptcp_subflow_get_send(msk);
+ if (!ssk)
+ return -EINVAL;
+
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+ return 0;
+}
+
+static struct mptcp_sched_ops mptcp_sched_default = {
+ .get_subflow = mptcp_sched_default_get_subflow,
+ .name = "default",
+ .owner = THIS_MODULE,
+};
+
+/* Must be called with rcu read lock held */
+struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+ struct mptcp_sched_ops *sched, *ret = NULL;
+
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ if (!strcmp(sched->name, name)) {
+ ret = sched;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (!sched->get_subflow)
+ return -EINVAL;
+
+ spin_lock(&mptcp_sched_list_lock);
+ if (mptcp_sched_find(sched->name)) {
+ spin_unlock(&mptcp_sched_list_lock);
+ return -EEXIST;
+ }
+ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+ spin_unlock(&mptcp_sched_list_lock);
+
+ pr_debug("%s registered", sched->name);
+ return 0;
+}
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (sched == &mptcp_sched_default)
+ return;
+
+ spin_lock(&mptcp_sched_list_lock);
+ list_del_rcu(&sched->list);
+ spin_unlock(&mptcp_sched_list_lock);
+}
+
+void mptcp_sched_init(void)
+{
+ mptcp_register_scheduler(&mptcp_sched_default);
+}
+
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched)
+{
+ if (!sched)
+ sched = &mptcp_sched_default;
+
+ if (!bpf_try_module_get(sched, sched->owner))
+ return -EBUSY;
+
+ msk->sched = sched;
+ if (msk->sched->init)
+ msk->sched->init(msk);
+
+ pr_debug("sched=%s", msk->sched->name);
+
+ return 0;
+}
+
+void mptcp_release_sched(struct mptcp_sock *msk)
+{
+ struct mptcp_sched_ops *sched = msk->sched;
+
+ if (!sched)
+ return;
+
+ msk->sched = NULL;
+ if (sched->release)
+ sched->release(msk);
+
+ bpf_module_put(sched, sched->owner);
+}
+
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled)
+{
+ WRITE_ONCE(subflow->scheduled, scheduled);
+}
+
+int mptcp_sched_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sched_data data;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_send */
+ if (__mptcp_check_fallback(msk)) {
+ if (msk->first &&
+ __tcp_can_send(msk->first) &&
+ sk_stream_memory_free(msk->first)) {
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ data.reinject = false;
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_subflow(msk, &data);
+ return msk->sched->get_subflow(msk, &data);
+}
+
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sched_data data;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_retrans */
+ if (__mptcp_check_fallback(msk))
+ return -EINVAL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ data.reinject = true;
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_subflow(msk, &data);
+ return msk->sched->get_subflow(msk, &data);
+}
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
new file mode 100644
index 000000000..116e30082
--- /dev/null
+++ b/net/mptcp/sockopt.c
@@ -0,0 +1,1486 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2021, Red Hat.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+#define MIN_INFO_OPTLEN_SIZE 16
+#define MIN_FULL_INFO_OPTLEN_SIZE 40
+
+static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
+{
+ msk_owned_by_me(msk);
+
+ if (likely(!__mptcp_check_fallback(msk)))
+ return NULL;
+
+ return msk->first;
+}
+
+static u32 sockopt_seq_reset(const struct sock *sk)
+{
+ sock_owned_by_me(sk);
+
+ /* Highbits contain state. Allows to distinguish sockopt_seq
+ * of listener and established:
+ * s0 = new_listener()
+ * sockopt(s0) - seq is 1
+ * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0)
+ * sockopt(s0) - seq increments to 2 on s0
+ * sockopt(s1) // seq increments to 2 on s1 (different option)
+ * new ssk completes join, inherits options from s0 // seq 2
+ * Needs sync from mptcp join logic, but ssk->seq == msk->seq
+ *
+ * Set High order bits to sk_state so ssk->seq == msk->seq test
+ * will fail.
+ */
+
+ return (u32)sk->sk_state << 24u;
+}
+
+static void sockopt_seq_inc(struct mptcp_sock *msk)
+{
+ u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff;
+
+ msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq;
+}
+
+static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen, int *val)
+{
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_sockptr(val, optval, sizeof(*val)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+ switch (optname) {
+ case SO_DEBUG:
+ sock_valbool_flag(ssk, SOCK_DBG, !!val);
+ break;
+ case SO_KEEPALIVE:
+ if (ssk->sk_prot->keepalive)
+ ssk->sk_prot->keepalive(ssk, !!val);
+ sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
+ break;
+ case SO_PRIORITY:
+ ssk->sk_priority = val;
+ break;
+ case SO_SNDBUF:
+ case SO_SNDBUFFORCE:
+ ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+ mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
+ break;
+ case SO_RCVBUF:
+ case SO_RCVBUFFORCE:
+ ssk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
+ break;
+ case SO_MARK:
+ if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
+ WRITE_ONCE(ssk->sk_mark, sk->sk_mark);
+ sk_dst_reset(ssk);
+ }
+ break;
+ case SO_INCOMING_CPU:
+ WRITE_ONCE(ssk->sk_incoming_cpu, val);
+ break;
+ }
+
+ subflow->setsockopt_seq = msk->setsockopt_seq;
+ unlock_sock_fast(ssk, slow);
+ }
+
+ release_sock(sk);
+}
+
+static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val)
+{
+ sockptr_t optval = KERNEL_SOCKPTR(&val);
+ struct sock *sk = (struct sock *)msk;
+ int ret;
+
+ ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
+ optval, sizeof(val));
+ if (ret)
+ return ret;
+
+ mptcp_sol_socket_sync_intval(msk, optname, val);
+ return 0;
+}
+
+static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+
+ mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val);
+}
+
+static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val)
+{
+ sockptr_t optval = KERNEL_SOCKPTR(&val);
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int ret;
+
+ ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
+ optval, sizeof(val));
+ if (ret)
+ return ret;
+
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+ sock_set_timestamp(sk, optname, !!val);
+ unlock_sock_fast(ssk, slow);
+ }
+
+ release_sock(sk);
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
+ sockptr_t optval,
+ unsigned int optlen)
+{
+ int val, ret;
+
+ ret = mptcp_get_int_option(msk, optval, optlen, &val);
+ if (ret)
+ return ret;
+
+ switch (optname) {
+ case SO_KEEPALIVE:
+ mptcp_sol_socket_sync_intval(msk, optname, val);
+ return 0;
+ case SO_DEBUG:
+ case SO_MARK:
+ case SO_PRIORITY:
+ case SO_SNDBUF:
+ case SO_SNDBUFFORCE:
+ case SO_RCVBUF:
+ case SO_RCVBUFFORCE:
+ return mptcp_sol_socket_intval(msk, optname, val);
+ case SO_INCOMING_CPU:
+ mptcp_so_incoming_cpu(msk, val);
+ return 0;
+ case SO_TIMESTAMP_OLD:
+ case SO_TIMESTAMP_NEW:
+ case SO_TIMESTAMPNS_OLD:
+ case SO_TIMESTAMPNS_NEW:
+ return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val);
+ }
+
+ return -ENOPROTOOPT;
+}
+
+static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
+ int optname,
+ sockptr_t optval,
+ unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct so_timestamping timestamping;
+ int ret;
+
+ if (optlen == sizeof(timestamping)) {
+ if (copy_from_sockptr(&timestamping, optval,
+ sizeof(timestamping)))
+ return -EFAULT;
+ } else if (optlen == sizeof(int)) {
+ memset(&timestamping, 0, sizeof(timestamping));
+
+ if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
+ return -EFAULT;
+ } else {
+ return -EINVAL;
+ }
+
+ ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(&timestamping),
+ sizeof(timestamping));
+ if (ret)
+ return ret;
+
+ lock_sock(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+ sock_set_timestamping(sk, optname, timestamping);
+ unlock_sock_fast(ssk, slow);
+ }
+
+ release_sock(sk);
+
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct linger ling;
+ sockptr_t kopt;
+ int ret;
+
+ if (optlen < sizeof(ling))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&ling, optval, sizeof(ling)))
+ return -EFAULT;
+
+ kopt = KERNEL_SOCKPTR(&ling);
+ ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling));
+ if (ret)
+ return ret;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
+
+ if (!ling.l_onoff) {
+ sock_reset_flag(ssk, SOCK_LINGER);
+ } else {
+ ssk->sk_lingertime = sk->sk_lingertime;
+ sock_set_flag(ssk, SOCK_LINGER);
+ }
+
+ subflow->setsockopt_seq = msk->setsockopt_seq;
+ unlock_sock_fast(ssk, slow);
+ }
+
+ release_sock(sk);
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sock *ssk;
+ int ret;
+
+ switch (optname) {
+ case SO_REUSEPORT:
+ case SO_REUSEADDR:
+ case SO_BINDTODEVICE:
+ case SO_BINDTOIFINDEX:
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ release_sock(sk);
+ return PTR_ERR(ssk);
+ }
+
+ ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen);
+ if (ret == 0) {
+ if (optname == SO_REUSEPORT)
+ sk->sk_reuseport = ssk->sk_reuseport;
+ else if (optname == SO_REUSEADDR)
+ sk->sk_reuse = ssk->sk_reuse;
+ else if (optname == SO_BINDTODEVICE)
+ sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
+ else if (optname == SO_BINDTOIFINDEX)
+ sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
+ }
+ release_sock(sk);
+ return ret;
+ case SO_KEEPALIVE:
+ case SO_PRIORITY:
+ case SO_SNDBUF:
+ case SO_SNDBUFFORCE:
+ case SO_RCVBUF:
+ case SO_RCVBUFFORCE:
+ case SO_MARK:
+ case SO_INCOMING_CPU:
+ case SO_DEBUG:
+ case SO_TIMESTAMP_OLD:
+ case SO_TIMESTAMP_NEW:
+ case SO_TIMESTAMPNS_OLD:
+ case SO_TIMESTAMPNS_NEW:
+ return mptcp_setsockopt_sol_socket_int(msk, optname, optval,
+ optlen);
+ case SO_TIMESTAMPING_OLD:
+ case SO_TIMESTAMPING_NEW:
+ return mptcp_setsockopt_sol_socket_timestamping(msk, optname,
+ optval, optlen);
+ case SO_LINGER:
+ return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen);
+ case SO_RCVLOWAT:
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ case SO_BUSY_POLL:
+ case SO_PREFER_BUSY_POLL:
+ case SO_BUSY_POLL_BUDGET:
+ /* No need to copy: only relevant for msk */
+ return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
+ case SO_NO_CHECK:
+ case SO_DONTROUTE:
+ case SO_BROADCAST:
+ case SO_BSDCOMPAT:
+ case SO_PASSCRED:
+ case SO_PASSPIDFD:
+ case SO_PASSSEC:
+ case SO_RXQ_OVFL:
+ case SO_WIFI_STATUS:
+ case SO_NOFCS:
+ case SO_SELECT_ERR_QUEUE:
+ return 0;
+ }
+
+ /* SO_OOBINLINE is not supported, let's avoid the related mess
+ * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF,
+ * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER,
+ * we must be careful with subflows
+ *
+ * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks
+ * explicitly the sk_protocol field
+ *
+ * SO_PEEK_OFF is unsupported, as it is for plain TCP
+ * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows
+ * SO_CNX_ADVICE is currently unsupported, could possibly be relevant,
+ * but likely needs careful design
+ *
+ * SO_ZEROCOPY is currently unsupported, TODO in sndmsg
+ * SO_TXTIME is currently unsupported
+ */
+
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ int ret = -EOPNOTSUPP;
+ struct sock *ssk;
+
+ switch (optname) {
+ case IPV6_V6ONLY:
+ case IPV6_TRANSPARENT:
+ case IPV6_FREEBIND:
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ release_sock(sk);
+ return PTR_ERR(ssk);
+ }
+
+ ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
+ if (ret != 0) {
+ release_sock(sk);
+ return ret;
+ }
+
+ sockopt_seq_inc(msk);
+
+ switch (optname) {
+ case IPV6_V6ONLY:
+ sk->sk_ipv6only = ssk->sk_ipv6only;
+ break;
+ case IPV6_TRANSPARENT:
+ inet_assign_bit(TRANSPARENT, sk,
+ inet_test_bit(TRANSPARENT, ssk));
+ break;
+ case IPV6_FREEBIND:
+ inet_assign_bit(FREEBIND, sk,
+ inet_test_bit(FREEBIND, ssk));
+ break;
+ }
+
+ release_sock(sk);
+ break;
+ }
+
+ return ret;
+}
+
+static bool mptcp_supported_sockopt(int level, int optname)
+{
+ if (level == SOL_IP) {
+ switch (optname) {
+ /* should work fine */
+ case IP_FREEBIND:
+ case IP_TRANSPARENT:
+
+ /* the following are control cmsg related */
+ case IP_PKTINFO:
+ case IP_RECVTTL:
+ case IP_RECVTOS:
+ case IP_RECVOPTS:
+ case IP_RETOPTS:
+ case IP_PASSSEC:
+ case IP_RECVORIGDSTADDR:
+ case IP_CHECKSUM:
+ case IP_RECVFRAGSIZE:
+
+ /* common stuff that need some love */
+ case IP_TOS:
+ case IP_TTL:
+ case IP_BIND_ADDRESS_NO_PORT:
+ case IP_MTU_DISCOVER:
+ case IP_RECVERR:
+
+ /* possibly less common may deserve some love */
+ case IP_MINTTL:
+
+ /* the following is apparently a no-op for plain TCP */
+ case IP_RECVERR_RFC4884:
+ return true;
+ }
+
+ /* IP_OPTIONS is not supported, needs subflow care */
+ /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */
+ /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF,
+ * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP,
+ * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE,
+ * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP,
+ * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE,
+ * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal
+ * with mcast stuff
+ */
+ /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */
+ return false;
+ }
+ if (level == SOL_IPV6) {
+ switch (optname) {
+ case IPV6_V6ONLY:
+
+ /* the following are control cmsg related */
+ case IPV6_RECVPKTINFO:
+ case IPV6_2292PKTINFO:
+ case IPV6_RECVHOPLIMIT:
+ case IPV6_2292HOPLIMIT:
+ case IPV6_RECVRTHDR:
+ case IPV6_2292RTHDR:
+ case IPV6_RECVHOPOPTS:
+ case IPV6_2292HOPOPTS:
+ case IPV6_RECVDSTOPTS:
+ case IPV6_2292DSTOPTS:
+ case IPV6_RECVTCLASS:
+ case IPV6_FLOWINFO:
+ case IPV6_RECVPATHMTU:
+ case IPV6_RECVORIGDSTADDR:
+ case IPV6_RECVFRAGSIZE:
+
+ /* the following ones need some love but are quite common */
+ case IPV6_TCLASS:
+ case IPV6_TRANSPARENT:
+ case IPV6_FREEBIND:
+ case IPV6_PKTINFO:
+ case IPV6_2292PKTOPTIONS:
+ case IPV6_UNICAST_HOPS:
+ case IPV6_MTU_DISCOVER:
+ case IPV6_MTU:
+ case IPV6_RECVERR:
+ case IPV6_FLOWINFO_SEND:
+ case IPV6_FLOWLABEL_MGR:
+ case IPV6_MINHOPCOUNT:
+ case IPV6_DONTFRAG:
+ case IPV6_AUTOFLOWLABEL:
+
+ /* the following one is a no-op for plain TCP */
+ case IPV6_RECVERR_RFC4884:
+ return true;
+ }
+
+ /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are
+ * not supported
+ */
+ /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF,
+ * IPV6_MULTICAST_IF, IPV6_ADDRFORM,
+ * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST,
+ * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP,
+ * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP,
+ * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER
+ * are not supported better not deal with mcast
+ */
+ /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */
+
+ /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */
+ /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */
+ return false;
+ }
+ if (level == SOL_TCP) {
+ switch (optname) {
+ /* the following are no-op or should work just fine */
+ case TCP_THIN_DUPACK:
+ case TCP_DEFER_ACCEPT:
+
+ /* the following need some love */
+ case TCP_MAXSEG:
+ case TCP_NODELAY:
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ case TCP_CONGESTION:
+ case TCP_CORK:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPCNT:
+ case TCP_SYNCNT:
+ case TCP_SAVE_SYN:
+ case TCP_LINGER2:
+ case TCP_WINDOW_CLAMP:
+ case TCP_QUICKACK:
+ case TCP_USER_TIMEOUT:
+ case TCP_TIMESTAMP:
+ case TCP_NOTSENT_LOWAT:
+ case TCP_TX_DELAY:
+ case TCP_INQ:
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ return true;
+ }
+
+ /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */
+
+ /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
+ * TCP_REPAIR_WINDOW are not supported, better avoid this mess
+ */
+ }
+ return false;
+}
+
+static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ char name[TCP_CA_NAME_MAX];
+ bool cap_net_admin;
+ int ret;
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ ret = strncpy_from_sockptr(name, optval,
+ min_t(long, TCP_CA_NAME_MAX - 1, optlen));
+ if (ret < 0)
+ return -EFAULT;
+
+ name[ret] = 0;
+
+ cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN);
+
+ ret = 0;
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err;
+
+ lock_sock(ssk);
+ err = tcp_set_congestion_control(ssk, name, true, cap_net_admin);
+ if (err < 0 && ret == 0)
+ ret = err;
+ subflow->setsockopt_seq = msk->setsockopt_seq;
+ release_sock(ssk);
+ }
+
+ if (ret == 0)
+ strcpy(msk->ca_name, name);
+
+ release_sock(sk);
+ return ret;
+}
+
+static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int val;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+ msk->cork = !!val;
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+ __tcp_sock_set_cork(ssk, !!val);
+ release_sock(ssk);
+ }
+ if (!val)
+ mptcp_check_and_set_pending(sk);
+ release_sock(sk);
+
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int val;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+ msk->nodelay = !!val;
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+ __tcp_sock_set_nodelay(ssk, !!val);
+ release_sock(ssk);
+ }
+ if (val)
+ mptcp_check_and_set_pending(sk);
+ release_sock(sk);
+
+ return 0;
+}
+
+static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sock *ssk;
+ int err;
+
+ err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
+ if (err != 0)
+ return err;
+
+ lock_sock(sk);
+
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ release_sock(sk);
+ return PTR_ERR(ssk);
+ }
+
+ switch (optname) {
+ case IP_FREEBIND:
+ inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
+ break;
+ case IP_TRANSPARENT:
+ inet_assign_bit(TRANSPARENT, ssk,
+ inet_test_bit(TRANSPARENT, sk));
+ break;
+ default:
+ release_sock(sk);
+ WARN_ON_ONCE(1);
+ return -EOPNOTSUPP;
+ }
+
+ sockopt_seq_inc(msk);
+ release_sock(sk);
+ return 0;
+}
+
+static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int err, val;
+
+ err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
+
+ if (err != 0)
+ return err;
+
+ lock_sock(sk);
+ sockopt_seq_inc(msk);
+ val = inet_sk(sk)->tos;
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ __ip_sock_set_tos(ssk, val);
+ unlock_sock_fast(ssk, slow);
+ }
+ release_sock(sk);
+
+ return 0;
+}
+
+static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ switch (optname) {
+ case IP_FREEBIND:
+ case IP_TRANSPARENT:
+ return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen);
+ case IP_TOS:
+ return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sock *ssk;
+ int ret;
+
+ /* Limit to first subflow, before the connection establishment */
+ lock_sock(sk);
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ ret = PTR_ERR(ssk);
+ goto unlock;
+ }
+
+ ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
+
+unlock:
+ release_sock(sk);
+ return ret;
+}
+
+static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = (void *)msk;
+ int ret, val;
+
+ switch (optname) {
+ case TCP_INQ:
+ ret = mptcp_get_int_option(msk, optval, optlen, &val);
+ if (ret)
+ return ret;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ lock_sock(sk);
+ msk->recvmsg_inq = !!val;
+ release_sock(sk);
+ return 0;
+ case TCP_ULP:
+ return -EOPNOTSUPP;
+ case TCP_CONGESTION:
+ return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
+ case TCP_CORK:
+ return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
+ case TCP_NODELAY:
+ return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
+ case TCP_DEFER_ACCEPT:
+ /* See tcp.c: TCP_DEFER_ACCEPT does not fail */
+ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
+ return 0;
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname,
+ optval, optlen);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+int mptcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk;
+
+ pr_debug("msk=%p", msk);
+
+ if (level == SOL_SOCKET)
+ return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
+
+ if (!mptcp_supported_sockopt(level, optname))
+ return -ENOPROTOOPT;
+
+ /* @@ the meaning of setsockopt() when the socket is connected and
+ * there are multiple subflows is not yet defined. It is up to the
+ * MPTCP-level socket to configure the subflows until the subflow
+ * is in TCP fallback, when TCP socket options are passed through
+ * to the one remaining subflow.
+ */
+ lock_sock(sk);
+ ssk = __mptcp_tcp_fallback(msk);
+ release_sock(sk);
+ if (ssk)
+ return tcp_setsockopt(ssk, level, optname, optval, optlen);
+
+ if (level == SOL_IP)
+ return mptcp_setsockopt_v4(msk, optname, optval, optlen);
+
+ if (level == SOL_IPV6)
+ return mptcp_setsockopt_v6(msk, optname, optval, optlen);
+
+ if (level == SOL_TCP)
+ return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen);
+
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sock *ssk;
+ int ret;
+
+ lock_sock(sk);
+ ssk = msk->first;
+ if (ssk) {
+ ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
+ goto out;
+ }
+
+ ssk = __mptcp_nmpc_sk(msk);
+ if (IS_ERR(ssk)) {
+ ret = PTR_ERR(ssk);
+ goto out;
+ }
+
+ ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
+
+out:
+ release_sock(sk);
+ return ret;
+}
+
+void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
+{
+ struct sock *sk = (struct sock *)msk;
+ u32 flags = 0;
+ bool slow;
+
+ memset(info, 0, sizeof(*info));
+
+ info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
+ info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
+ info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
+ info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
+
+ if (inet_sk_state_load(sk) == TCP_LISTEN)
+ return;
+
+ /* The following limits only make sense for the in-kernel PM */
+ if (mptcp_pm_is_kernel(msk)) {
+ info->mptcpi_subflows_max =
+ mptcp_pm_get_subflows_max(msk);
+ info->mptcpi_add_addr_signal_max =
+ mptcp_pm_get_add_addr_signal_max(msk);
+ info->mptcpi_add_addr_accepted_max =
+ mptcp_pm_get_add_addr_accept_max(msk);
+ info->mptcpi_local_addr_max =
+ mptcp_pm_get_local_addr_max(msk);
+ }
+
+ if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
+ flags |= MPTCP_INFO_FLAG_FALLBACK;
+ if (READ_ONCE(msk->can_ack))
+ flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
+ info->mptcpi_flags = flags;
+ mptcp_data_lock(sk);
+ info->mptcpi_snd_una = msk->snd_una;
+ info->mptcpi_rcv_nxt = msk->ack_seq;
+ info->mptcpi_bytes_acked = msk->bytes_acked;
+ mptcp_data_unlock(sk);
+
+ slow = lock_sock_fast(sk);
+ info->mptcpi_csum_enabled = msk->csum_enabled;
+ info->mptcpi_token = msk->token;
+ info->mptcpi_write_seq = msk->write_seq;
+ info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
+ info->mptcpi_bytes_sent = msk->bytes_sent;
+ info->mptcpi_bytes_received = msk->bytes_received;
+ info->mptcpi_bytes_retrans = msk->bytes_retrans;
+ unlock_sock_fast(sk, slow);
+}
+EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
+
+static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen)
+{
+ struct mptcp_info m_info;
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(struct mptcp_info));
+
+ mptcp_diag_fill_info(msk, &m_info);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &m_info, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
+ char __user *optval,
+ u32 copied,
+ int __user *optlen)
+{
+ u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd));
+
+ if (copied)
+ copied += sfd->size_subflow_data;
+ else
+ copied = copylen;
+
+ if (put_user(copied, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, sfd, copylen))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
+ char __user *optval,
+ int __user *optlen)
+{
+ int len, copylen;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ /* if mptcp_subflow_data size is changed, need to adjust
+ * this function to deal with programs using old version.
+ */
+ BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE);
+
+ if (len < MIN_INFO_OPTLEN_SIZE)
+ return -EINVAL;
+
+ memset(sfd, 0, sizeof(*sfd));
+
+ copylen = min_t(unsigned int, len, sizeof(*sfd));
+ if (copy_from_user(sfd, optval, copylen))
+ return -EFAULT;
+
+ /* size_subflow_data is u32, but len is signed */
+ if (sfd->size_subflow_data > INT_MAX ||
+ sfd->size_user > INT_MAX)
+ return -EINVAL;
+
+ if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE ||
+ sfd->size_subflow_data > len)
+ return -EINVAL;
+
+ if (sfd->num_subflows || sfd->size_kernel)
+ return -EINVAL;
+
+ return len - sfd->size_subflow_data;
+}
+
+static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ unsigned int sfcount = 0, copied = 0;
+ struct mptcp_subflow_data sfd;
+ char __user *infoptr;
+ int len;
+
+ len = mptcp_get_subflow_data(&sfd, optval, optlen);
+ if (len < 0)
+ return len;
+
+ sfd.size_kernel = sizeof(struct tcp_info);
+ sfd.size_user = min_t(unsigned int, sfd.size_user,
+ sizeof(struct tcp_info));
+
+ infoptr = optval + sfd.size_subflow_data;
+
+ lock_sock(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ ++sfcount;
+
+ if (len && len >= sfd.size_user) {
+ struct tcp_info info;
+
+ tcp_get_info(ssk, &info);
+
+ if (copy_to_user(infoptr, &info, sfd.size_user)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+
+ infoptr += sfd.size_user;
+ copied += sfd.size_user;
+ len -= sfd.size_user;
+ }
+ }
+
+ release_sock(sk);
+
+ sfd.num_subflows = sfcount;
+
+ if (mptcp_put_subflow_data(&sfd, optval, copied, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ memset(a, 0, sizeof(*a));
+
+ if (sk->sk_family == AF_INET) {
+ a->sin_local.sin_family = AF_INET;
+ a->sin_local.sin_port = inet->inet_sport;
+ a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr;
+
+ if (!a->sin_local.sin_addr.s_addr)
+ a->sin_local.sin_addr.s_addr = inet->inet_saddr;
+
+ a->sin_remote.sin_family = AF_INET;
+ a->sin_remote.sin_port = inet->inet_dport;
+ a->sin_remote.sin_addr.s_addr = inet->inet_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (WARN_ON_ONCE(!np))
+ return;
+
+ a->sin6_local.sin6_family = AF_INET6;
+ a->sin6_local.sin6_port = inet->inet_sport;
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ a->sin6_local.sin6_addr = np->saddr;
+ else
+ a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr;
+
+ a->sin6_remote.sin6_family = AF_INET6;
+ a->sin6_remote.sin6_port = inet->inet_dport;
+ a->sin6_remote.sin6_addr = sk->sk_v6_daddr;
+#endif
+ }
+}
+
+static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ unsigned int sfcount = 0, copied = 0;
+ struct mptcp_subflow_data sfd;
+ char __user *addrptr;
+ int len;
+
+ len = mptcp_get_subflow_data(&sfd, optval, optlen);
+ if (len < 0)
+ return len;
+
+ sfd.size_kernel = sizeof(struct mptcp_subflow_addrs);
+ sfd.size_user = min_t(unsigned int, sfd.size_user,
+ sizeof(struct mptcp_subflow_addrs));
+
+ addrptr = optval + sfd.size_subflow_data;
+
+ lock_sock(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ ++sfcount;
+
+ if (len && len >= sfd.size_user) {
+ struct mptcp_subflow_addrs a;
+
+ mptcp_get_sub_addrs(ssk, &a);
+
+ if (copy_to_user(addrptr, &a, sfd.size_user)) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+
+ addrptr += sfd.size_user;
+ copied += sfd.size_user;
+ len -= sfd.size_user;
+ }
+ }
+
+ release_sock(sk);
+
+ sfd.num_subflows = sfcount;
+
+ if (mptcp_put_subflow_data(&sfd, optval, copied, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int mptcp_get_full_info(struct mptcp_full_info *mfi,
+ char __user *optval,
+ int __user *optlen)
+{
+ int len;
+
+ BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) !=
+ MIN_FULL_INFO_OPTLEN_SIZE);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < MIN_FULL_INFO_OPTLEN_SIZE)
+ return -EINVAL;
+
+ memset(mfi, 0, sizeof(*mfi));
+ if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE))
+ return -EFAULT;
+
+ if (mfi->size_tcpinfo_kernel ||
+ mfi->size_sfinfo_kernel ||
+ mfi->num_subflows)
+ return -EINVAL;
+
+ if (mfi->size_sfinfo_user > INT_MAX ||
+ mfi->size_tcpinfo_user > INT_MAX)
+ return -EINVAL;
+
+ return len - MIN_FULL_INFO_OPTLEN_SIZE;
+}
+
+static int mptcp_put_full_info(struct mptcp_full_info *mfi,
+ char __user *optval,
+ u32 copylen,
+ int __user *optlen)
+{
+ copylen += MIN_FULL_INFO_OPTLEN_SIZE;
+ if (put_user(copylen, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, mfi, copylen))
+ return -EFAULT;
+ return 0;
+}
+
+static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen)
+{
+ unsigned int sfcount = 0, copylen = 0;
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ void __user *tcpinfoptr, *sfinfoptr;
+ struct mptcp_full_info mfi;
+ int len;
+
+ len = mptcp_get_full_info(&mfi, optval, optlen);
+ if (len < 0)
+ return len;
+
+ /* don't bother filling the mptcp info if there is not enough
+ * user-space-provided storage
+ */
+ if (len > 0) {
+ mptcp_diag_fill_info(msk, &mfi.mptcp_info);
+ copylen += min_t(unsigned int, len, sizeof(struct mptcp_info));
+ }
+
+ mfi.size_tcpinfo_kernel = sizeof(struct tcp_info);
+ mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user,
+ sizeof(struct tcp_info));
+ sfinfoptr = u64_to_user_ptr(mfi.subflow_info);
+ mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info);
+ mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user,
+ sizeof(struct mptcp_subflow_info));
+ tcpinfoptr = u64_to_user_ptr(mfi.tcp_info);
+
+ lock_sock(sk);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct mptcp_subflow_info sfinfo;
+ struct tcp_info tcp_info;
+
+ if (sfcount++ >= mfi.size_arrays_user)
+ continue;
+
+ /* fetch addr/tcp_info only if the user space buffers
+ * are wide enough
+ */
+ memset(&sfinfo, 0, sizeof(sfinfo));
+ sfinfo.id = subflow->subflow_id;
+ if (mfi.size_sfinfo_user >
+ offsetof(struct mptcp_subflow_info, addrs))
+ mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
+ if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user))
+ goto fail_release;
+
+ if (mfi.size_tcpinfo_user) {
+ tcp_get_info(ssk, &tcp_info);
+ if (copy_to_user(tcpinfoptr, &tcp_info,
+ mfi.size_tcpinfo_user))
+ goto fail_release;
+ }
+
+ tcpinfoptr += mfi.size_tcpinfo_user;
+ sfinfoptr += mfi.size_sfinfo_user;
+ }
+ release_sock(sk);
+
+ mfi.num_subflows = sfcount;
+ if (mptcp_put_full_info(&mfi, optval, copylen, optlen))
+ return -EFAULT;
+
+ return 0;
+
+fail_release:
+ release_sock(sk);
+ return -EFAULT;
+}
+
+static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
+ int __user *optlen, int val)
+{
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
+ unsigned char ucval = (unsigned char)val;
+
+ len = 1;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ucval, 1))
+ return -EFAULT;
+ } else {
+ len = min_t(unsigned int, len, sizeof(int));
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
+ char __user *optval, int __user *optlen)
+{
+ switch (optname) {
+ case TCP_ULP:
+ case TCP_CONGESTION:
+ case TCP_INFO:
+ case TCP_CC_INFO:
+ case TCP_DEFER_ACCEPT:
+ case TCP_FASTOPEN:
+ case TCP_FASTOPEN_CONNECT:
+ case TCP_FASTOPEN_KEY:
+ case TCP_FASTOPEN_NO_COOKIE:
+ return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
+ optval, optlen);
+ case TCP_INQ:
+ return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq);
+ case TCP_CORK:
+ return mptcp_put_int_option(msk, optval, optlen, msk->cork);
+ case TCP_NODELAY:
+ return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
+ }
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = (void *)msk;
+
+ switch (optname) {
+ case IP_TOS:
+ return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
+ char __user *optval, int __user *optlen)
+{
+ switch (optname) {
+ case MPTCP_INFO:
+ return mptcp_getsockopt_info(msk, optval, optlen);
+ case MPTCP_FULL_INFO:
+ return mptcp_getsockopt_full_info(msk, optval, optlen);
+ case MPTCP_TCPINFO:
+ return mptcp_getsockopt_tcpinfo(msk, optval, optlen);
+ case MPTCP_SUBFLOW_ADDRS:
+ return mptcp_getsockopt_subflow_addrs(msk, optval, optlen);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+int mptcp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *option)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk;
+
+ pr_debug("msk=%p", msk);
+
+ /* @@ the meaning of setsockopt() when the socket is connected and
+ * there are multiple subflows is not yet defined. It is up to the
+ * MPTCP-level socket to configure the subflows until the subflow
+ * is in TCP fallback, when socket options are passed through
+ * to the one remaining subflow.
+ */
+ lock_sock(sk);
+ ssk = __mptcp_tcp_fallback(msk);
+ release_sock(sk);
+ if (ssk)
+ return tcp_getsockopt(ssk, level, optname, optval, option);
+
+ if (level == SOL_IP)
+ return mptcp_getsockopt_v4(msk, optname, optval, option);
+ if (level == SOL_TCP)
+ return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
+ if (level == SOL_MPTCP)
+ return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option);
+ return -EOPNOTSUPP;
+}
+
+static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
+{
+ static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
+ struct sock *sk = (struct sock *)msk;
+
+ if (ssk->sk_prot->keepalive) {
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ ssk->sk_prot->keepalive(ssk, 1);
+ else
+ ssk->sk_prot->keepalive(ssk, 0);
+ }
+
+ ssk->sk_priority = sk->sk_priority;
+ ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
+ ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
+ ssk->sk_ipv6only = sk->sk_ipv6only;
+ __ip_sock_set_tos(ssk, inet_sk(sk)->tos);
+
+ if (sk->sk_userlocks & tx_rx_locks) {
+ ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
+ WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+ mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
+ }
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+ WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
+ }
+
+ if (sock_flag(sk, SOCK_LINGER)) {
+ ssk->sk_lingertime = sk->sk_lingertime;
+ sock_set_flag(ssk, SOCK_LINGER);
+ } else {
+ sock_reset_flag(ssk, SOCK_LINGER);
+ }
+
+ if (sk->sk_mark != ssk->sk_mark) {
+ ssk->sk_mark = sk->sk_mark;
+ sk_dst_reset(ssk);
+ }
+
+ sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG));
+
+ if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops)
+ tcp_set_congestion_control(ssk, msk->ca_name, false, true);
+ __tcp_sock_set_cork(ssk, !!msk->cork);
+ __tcp_sock_set_nodelay(ssk, !!msk->nodelay);
+
+ inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
+ inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
+}
+
+static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
+{
+ bool slow = lock_sock_fast(ssk);
+
+ sync_socket_options(msk, ssk);
+
+ unlock_sock_fast(ssk, slow);
+}
+
+void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ msk_owned_by_me(msk);
+
+ if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
+ __mptcp_sockopt_sync(msk, ssk);
+
+ subflow->setsockopt_seq = msk->setsockopt_seq;
+ }
+}
+
+void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ msk_owned_by_me(msk);
+
+ if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
+ sync_socket_options(msk, ssk);
+
+ subflow->setsockopt_seq = msk->setsockopt_seq;
+ }
+}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
new file mode 100644
index 000000000..d3c5ecf8d
--- /dev/null
+++ b/net/mptcp/subflow.c
@@ -0,0 +1,2077 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <crypto/algapi.h>
+#include <crypto/sha2.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/ip6_route.h>
+#include <net/transp_v6.h>
+#endif
+#include <net/mptcp.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+#include "mib.h"
+
+#include <trace/events/mptcp.h>
+#include <trace/events/sock.h>
+
+static void mptcp_subflow_ops_undo_override(struct sock *ssk);
+
+static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
+ enum linux_mptcp_mib_field field)
+{
+ MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
+}
+
+static void subflow_req_destructor(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+ pr_debug("subflow_req=%p", subflow_req);
+
+ if (subflow_req->msk)
+ sock_put((struct sock *)subflow_req->msk);
+
+ mptcp_token_destroy_request(req);
+}
+
+static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hmac)
+{
+ u8 msg[8];
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+}
+
+static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
+{
+ return mptcp_is_fully_established((void *)msk) &&
+ ((mptcp_pm_is_userspace(msk) &&
+ mptcp_userspace_pm_active(msk)) ||
+ READ_ONCE(msk->pm.accept_subflow));
+}
+
+/* validate received token and create truncated hmac and nonce for SYN-ACK */
+static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
+{
+ struct mptcp_sock *msk = subflow_req->msk;
+ u8 hmac[SHA256_DIGEST_SIZE];
+
+ get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
+
+ subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_req->local_nonce,
+ subflow_req->remote_nonce, hmac);
+
+ subflow_req->thmac = get_unaligned_be64(hmac);
+}
+
+static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_sock *msk;
+ int local_id;
+
+ msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token);
+ if (!msk) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
+ return NULL;
+ }
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
+ if (local_id < 0) {
+ sock_put((struct sock *)msk);
+ return NULL;
+ }
+ subflow_req->local_id = local_id;
+
+ return msk;
+}
+
+static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+ subflow_req->mp_capable = 0;
+ subflow_req->mp_join = 0;
+ subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
+ subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener));
+ subflow_req->msk = NULL;
+ mptcp_token_init_request(req);
+}
+
+static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
+{
+ return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
+}
+
+static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
+{
+ struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
+
+ if (mpext) {
+ memset(mpext, 0, sizeof(*mpext));
+ mpext->reset_reason = reason;
+ }
+}
+
+/* Init mptcp request socket.
+ *
+ * Returns an error code if a JOIN has failed and a TCP reset
+ * should be sent.
+ */
+static int subflow_check_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_options_received mp_opt;
+ bool opt_mp_capable, opt_mp_join;
+
+ pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+ return -EINVAL;
+#endif
+
+ mptcp_get_options(skb, &mp_opt);
+
+ opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN);
+ opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN);
+ if (opt_mp_capable) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
+
+ if (opt_mp_join)
+ return 0;
+ } else if (opt_mp_join) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+ }
+
+ if (opt_mp_capable && listener->request_mptcp) {
+ int err, retries = MPTCP_TOKEN_MAX_RETRIES;
+
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+again:
+ do {
+ get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
+ } while (subflow_req->local_key == 0);
+
+ if (unlikely(req->syncookie)) {
+ mptcp_crypto_key_sha(subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ if (mptcp_token_exists(subflow_req->token)) {
+ if (retries-- > 0)
+ goto again;
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
+ } else {
+ subflow_req->mp_capable = 1;
+ }
+ return 0;
+ }
+
+ err = mptcp_token_new_request(req);
+ if (err == 0)
+ subflow_req->mp_capable = 1;
+ else if (retries-- > 0)
+ goto again;
+ else
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
+
+ } else if (opt_mp_join && listener->request_mptcp) {
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ subflow_req->mp_join = 1;
+ subflow_req->backup = mp_opt.backup;
+ subflow_req->remote_id = mp_opt.join_id;
+ subflow_req->token = mp_opt.token;
+ subflow_req->remote_nonce = mp_opt.nonce;
+ subflow_req->msk = subflow_token_join_request(req);
+
+ /* Can't fall back to TCP in this case. */
+ if (!subflow_req->msk) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
+ return -EPERM;
+ }
+
+ if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
+ pr_debug("syn inet_sport=%d %d",
+ ntohs(inet_sk(sk_listener)->inet_sport),
+ ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
+ if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
+ return -EPERM;
+ }
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
+ }
+
+ subflow_req_create_thmac(subflow_req);
+
+ if (unlikely(req->syncookie)) {
+ if (mptcp_can_accept_new_subflow(subflow_req->msk))
+ subflow_init_req_cookie_join_save(subflow_req, skb);
+ else
+ return -EPERM;
+ }
+
+ pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
+ subflow_req->remote_nonce, subflow_req->msk);
+ }
+
+ return 0;
+}
+
+int mptcp_subflow_init_cookie_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_options_received mp_opt;
+ bool opt_mp_capable, opt_mp_join;
+ int err;
+
+ subflow_init_req(req, sk_listener);
+ mptcp_get_options(skb, &mp_opt);
+
+ opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK);
+ opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK);
+ if (opt_mp_capable && opt_mp_join)
+ return -EINVAL;
+
+ if (opt_mp_capable && listener->request_mptcp) {
+ if (mp_opt.sndr_key == 0)
+ return -EINVAL;
+
+ subflow_req->local_key = mp_opt.rcvr_key;
+ err = mptcp_token_new_request(req);
+ if (err)
+ return err;
+
+ subflow_req->mp_capable = 1;
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
+ } else if (opt_mp_join && listener->request_mptcp) {
+ if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
+ return -EINVAL;
+
+ subflow_req->mp_join = 1;
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
+
+static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
+ struct sk_buff *skb,
+ struct flowi *fl,
+ struct request_sock *req)
+{
+ struct dst_entry *dst;
+ int err;
+
+ tcp_rsk(req)->is_mptcp = 1;
+ subflow_init_req(req, sk);
+
+ dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+ if (!dst)
+ return NULL;
+
+ err = subflow_check_req(req, sk, skb);
+ if (err == 0)
+ return dst;
+
+ dst_release(dst);
+ if (!req->syncookie)
+ tcp_request_sock_ops.send_reset(sk, skb);
+ return NULL;
+}
+
+static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ /* clear tstamp_ok, as needed depending on cookie */
+ if (foc && foc->len > -1)
+ ireq->tstamp_ok = 0;
+
+ if (synack_type == TCP_SYNACK_FASTOPEN)
+ mptcp_fastopen_subflow_synack_set_params(subflow, req);
+}
+
+static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
+{
+ subflow_prep_synack(sk, req, foc, synack_type);
+
+ return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
+ synack_type, syn_skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
+{
+ subflow_prep_synack(sk, req, foc, synack_type);
+
+ return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
+ synack_type, syn_skb);
+}
+
+static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
+ struct sk_buff *skb,
+ struct flowi *fl,
+ struct request_sock *req)
+{
+ struct dst_entry *dst;
+ int err;
+
+ tcp_rsk(req)->is_mptcp = 1;
+ subflow_init_req(req, sk);
+
+ dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+ if (!dst)
+ return NULL;
+
+ err = subflow_check_req(req, sk, skb);
+ if (err == 0)
+ return dst;
+
+ dst_release(dst);
+ if (!req->syncookie)
+ tcp6_request_sock_ops.send_reset(sk, skb);
+ return NULL;
+}
+#endif
+
+/* validate received truncated hmac and create hmac for third ACK */
+static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
+{
+ u8 hmac[SHA256_DIGEST_SIZE];
+ u64 thmac;
+
+ subflow_generate_hmac(subflow->remote_key, subflow->local_key,
+ subflow->remote_nonce, subflow->local_nonce,
+ hmac);
+
+ thmac = get_unaligned_be64(hmac);
+ pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
+ subflow, subflow->token, thmac, subflow->thmac);
+
+ return thmac == subflow->thmac;
+}
+
+void mptcp_subflow_reset(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ /* mptcp_mp_fail_no_response() can reach here on an already closed
+ * socket
+ */
+ if (ssk->sk_state == TCP_CLOSE)
+ return;
+
+ /* must hold: tcp_done() could drop last reference on parent */
+ sock_hold(sk);
+
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ tcp_done(ssk);
+ if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
+
+ sock_put(sk);
+}
+
+static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
+{
+ return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
+}
+
+void __mptcp_sync_state(struct sock *sk, int state)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ __mptcp_propagate_sndbuf(sk, msk->first);
+ if (sk->sk_state == TCP_SYN_SENT) {
+ inet_sk_state_store(sk, state);
+ sk->sk_state_change(sk);
+ }
+}
+
+static void mptcp_propagate_state(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk)) {
+ __mptcp_sync_state(sk, ssk->sk_state);
+ } else {
+ msk->pending_state = ssk->sk_state;
+ __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
+ }
+ mptcp_data_unlock(sk);
+}
+
+static void subflow_set_remote_key(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
+{
+ /* active MPC subflow will reach here multiple times:
+ * at subflow_finish_connect() time and at 4th ack time
+ */
+ if (subflow->remote_key_valid)
+ return;
+
+ subflow->remote_key_valid = 1;
+ subflow->remote_key = mp_opt->sndr_key;
+ mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
+ subflow->iasn++;
+
+ WRITE_ONCE(msk->remote_key, subflow->remote_key);
+ WRITE_ONCE(msk->ack_seq, subflow->iasn);
+ WRITE_ONCE(msk->can_ack, true);
+ atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
+}
+
+static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_options_received mp_opt;
+ struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
+
+ subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
+
+ /* be sure no special action on any packet other than syn-ack */
+ if (subflow->conn_finished)
+ return;
+
+ msk = mptcp_sk(parent);
+ subflow->rel_write_seq = 1;
+ subflow->conn_finished = 1;
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+ pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
+
+ mptcp_get_options(skb, &mp_opt);
+ if (subflow->request_mptcp) {
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+ mptcp_do_fallback(sk);
+ pr_fallback(msk);
+ goto fallback;
+ }
+
+ if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
+ WRITE_ONCE(msk->csum_enabled, true);
+ if (mp_opt.deny_join_id0)
+ WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
+ subflow->mp_capable = 1;
+ subflow_set_remote_key(msk, subflow, &mp_opt);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
+ mptcp_finish_connect(sk);
+ mptcp_propagate_state(parent, sk);
+ } else if (subflow->request_join) {
+ u8 hmac[SHA256_DIGEST_SIZE];
+
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) {
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
+ goto do_reset;
+ }
+
+ subflow->backup = mp_opt.backup;
+ subflow->thmac = mp_opt.thmac;
+ subflow->remote_nonce = mp_opt.nonce;
+ subflow->remote_id = mp_opt.join_id;
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
+ subflow, subflow->thmac, subflow->remote_nonce,
+ subflow->backup);
+
+ if (!subflow_thmac_valid(subflow)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
+ goto do_reset;
+ }
+
+ if (!mptcp_finish_join(sk))
+ goto do_reset;
+
+ subflow_generate_hmac(subflow->local_key, subflow->remote_key,
+ subflow->local_nonce,
+ subflow->remote_nonce,
+ hmac);
+ memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
+
+ subflow->mp_join = 1;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+
+ if (subflow_use_different_dport(msk, sk)) {
+ pr_debug("synack inet_dport=%d %d",
+ ntohs(inet_sk(sk)->inet_dport),
+ ntohs(inet_sk(parent)->inet_dport));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
+ }
+ } else if (mptcp_check_fallback(sk)) {
+fallback:
+ mptcp_rcv_space_init(msk, sk);
+ mptcp_propagate_state(parent, sk);
+ }
+ return;
+
+do_reset:
+ subflow->reset_transient = 0;
+ mptcp_subflow_reset(sk);
+}
+
+static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
+{
+ subflow->local_id = local_id;
+ subflow->local_id_valid = 1;
+}
+
+static int subflow_chk_local_id(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ int err;
+
+ if (likely(subflow->local_id_valid))
+ return 0;
+
+ err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+ if (err < 0)
+ return err;
+
+ subflow_set_local_id(subflow, err);
+ return 0;
+}
+
+static int subflow_rebuild_header(struct sock *sk)
+{
+ int err = subflow_chk_local_id(sk);
+
+ if (unlikely(err < 0))
+ return err;
+
+ return inet_sk_rebuild_header(sk);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static int subflow_v6_rebuild_header(struct sock *sk)
+{
+ int err = subflow_chk_local_id(sk);
+
+ if (unlikely(err < 0))
+ return err;
+
+ return inet6_sk_rebuild_header(sk);
+}
+#endif
+
+static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;
+
+static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ pr_debug("subflow=%p", subflow);
+
+ /* Never answer to SYNs sent to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops,
+ &subflow_request_sock_ipv4_ops,
+ sk, skb);
+drop:
+ tcp_listendrop(sk);
+ return 0;
+}
+
+static void subflow_v4_req_destructor(struct request_sock *req)
+{
+ subflow_req_destructor(req);
+ tcp_request_sock_ops.destructor(req);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init;
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
+static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
+static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
+static struct proto tcpv6_prot_override __ro_after_init;
+
+static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ pr_debug("subflow=%p", subflow);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return subflow_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+ if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+ __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+ return 0;
+ }
+
+ return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops,
+ &subflow_request_sock_ipv6_ops, sk, skb);
+
+drop:
+ tcp_listendrop(sk);
+ return 0; /* don't send reset */
+}
+
+static void subflow_v6_req_destructor(struct request_sock *req)
+{
+ subflow_req_destructor(req);
+ tcp6_request_sock_ops.destructor(req);
+}
+#endif
+
+struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener,
+ bool attach_listener)
+{
+ if (ops->family == AF_INET)
+ ops = &mptcp_subflow_v4_request_sock_ops;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ops->family == AF_INET6)
+ ops = &mptcp_subflow_v6_request_sock_ops;
+#endif
+
+ return inet_reqsk_alloc(ops, sk_listener, attach_listener);
+}
+EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);
+
+/* validate hmac received in third ACK */
+static bool subflow_hmac_valid(const struct request_sock *req,
+ const struct mptcp_options_received *mp_opt)
+{
+ const struct mptcp_subflow_request_sock *subflow_req;
+ u8 hmac[SHA256_DIGEST_SIZE];
+ struct mptcp_sock *msk;
+
+ subflow_req = mptcp_subflow_rsk(req);
+ msk = subflow_req->msk;
+ if (!msk)
+ return false;
+
+ subflow_generate_hmac(msk->remote_key, msk->local_key,
+ subflow_req->remote_nonce,
+ subflow_req->local_nonce, hmac);
+
+ return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
+}
+
+static void subflow_ulp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *old_ctx)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ mptcp_subflow_tcp_fallback(sk, old_ctx);
+ icsk->icsk_ulp_ops = NULL;
+ rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+ tcp_sk(sk)->is_mptcp = 0;
+
+ mptcp_subflow_ops_undo_override(sk);
+}
+
+void mptcp_subflow_drop_ctx(struct sock *ssk)
+{
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+
+ if (!ctx)
+ return;
+
+ list_del(&mptcp_subflow_ctx(ssk)->node);
+ if (inet_csk(ssk)->icsk_ulp_ops) {
+ subflow_ulp_fallback(ssk, ctx);
+ if (ctx->conn)
+ sock_put(ctx->conn);
+ }
+
+ kfree_rcu(ctx, rcu);
+}
+
+void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
+ const struct mptcp_options_received *mp_opt)
+{
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ subflow_set_remote_key(msk, subflow, mp_opt);
+ subflow->fully_established = 1;
+ WRITE_ONCE(msk->fully_established, true);
+
+ if (subflow->is_mptfo)
+ mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
+}
+
+static struct sock *subflow_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_request_sock *subflow_req;
+ struct mptcp_options_received mp_opt;
+ bool fallback, fallback_is_fatal;
+ struct mptcp_sock *owner;
+ struct sock *child;
+
+ pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+
+ /* After child creation we must look for MPC even when options
+ * are not parsed
+ */
+ mp_opt.suboptions = 0;
+
+ /* hopefully temporary handling for MP_JOIN+syncookie */
+ subflow_req = mptcp_subflow_rsk(req);
+ fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
+ fallback = !tcp_rsk(req)->is_mptcp;
+ if (fallback)
+ goto create_child;
+
+ /* if the sk is MP_CAPABLE, we try to fetch the client key */
+ if (subflow_req->mp_capable) {
+ /* we can receive and accept an in-window, out-of-order pkt,
+ * which may not carry the MP_CAPABLE opt even on mptcp enabled
+ * paths: always try to extract the peer key, and fallback
+ * for packets missing it.
+ * Even OoO DSS packets coming legitly after dropped or
+ * reordered MPC will cause fallback, but we don't have other
+ * options.
+ */
+ mptcp_get_options(skb, &mp_opt);
+ if (!(mp_opt.suboptions &
+ (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK)))
+ fallback = true;
+
+ } else if (subflow_req->mp_join) {
+ mptcp_get_options(skb, &mp_opt);
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) ||
+ !subflow_hmac_valid(req, &mp_opt) ||
+ !mptcp_can_accept_new_subflow(subflow_req->msk)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ fallback = true;
+ }
+ }
+
+create_child:
+ child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ req_unhash, own_req);
+
+ if (child && *own_req) {
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
+
+ tcp_rsk(req)->drop_req = false;
+
+ /* we need to fallback on ctx allocation failure and on pre-reqs
+ * checking above. In the latter scenario we additionally need
+ * to reset the context to non MPTCP status.
+ */
+ if (!ctx || fallback) {
+ if (fallback_is_fatal) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
+ goto dispose_child;
+ }
+ goto fallback;
+ }
+
+ /* ssk inherits options of listener sk */
+ ctx->setsockopt_seq = listener->setsockopt_seq;
+
+ if (ctx->mp_capable) {
+ ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req);
+ if (!ctx->conn)
+ goto fallback;
+
+ ctx->subflow_id = 1;
+ owner = mptcp_sk(ctx->conn);
+ mptcp_pm_new_connection(owner, child, 1);
+
+ /* with OoO packets we can reach here without ingress
+ * mpc option
+ */
+ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
+ mptcp_subflow_fully_established(ctx, &mp_opt);
+ mptcp_pm_fully_established(owner, child);
+ ctx->pm_notified = 1;
+ }
+ } else if (ctx->mp_join) {
+ owner = subflow_req->msk;
+ if (!owner) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ goto dispose_child;
+ }
+
+ /* move the msk reference ownership to the subflow */
+ subflow_req->msk = NULL;
+ ctx->conn = (struct sock *)owner;
+
+ if (subflow_use_different_sport(owner, sk)) {
+ pr_debug("ack inet_sport=%d %d",
+ ntohs(inet_sk(sk)->inet_sport),
+ ntohs(inet_sk((struct sock *)owner)->inet_sport));
+ if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
+ goto dispose_child;
+ }
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
+ }
+
+ if (!mptcp_finish_join(child))
+ goto dispose_child;
+
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
+ tcp_rsk(req)->drop_req = true;
+ }
+ }
+
+ /* check for expected invariant - should never trigger, just help
+ * catching eariler subtle bugs
+ */
+ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
+ (!mptcp_subflow_ctx(child) ||
+ !mptcp_subflow_ctx(child)->conn));
+ return child;
+
+dispose_child:
+ mptcp_subflow_drop_ctx(child);
+ tcp_rsk(req)->drop_req = true;
+ inet_csk_prepare_for_destroy_sock(child);
+ tcp_done(child);
+ req->rsk_ops->send_reset(sk, skb);
+
+ /* The last child reference will be released by the caller */
+ return child;
+
+fallback:
+ mptcp_subflow_drop_ctx(child);
+ return child;
+}
+
+static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
+static struct proto tcp_prot_override __ro_after_init;
+
+enum mapping_status {
+ MAPPING_OK,
+ MAPPING_INVALID,
+ MAPPING_EMPTY,
+ MAPPING_DATA_FIN,
+ MAPPING_DUMMY,
+ MAPPING_BAD_CSUM
+};
+
+static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
+{
+ pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+ ssn, subflow->map_subflow_seq, subflow->map_data_len);
+}
+
+static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned int skb_consumed;
+
+ skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
+ if (WARN_ON_ONCE(skb_consumed >= skb->len))
+ return true;
+
+ return skb->len - skb_consumed <= subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+}
+
+static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
+
+ if (unlikely(before(ssn, subflow->map_subflow_seq))) {
+ /* Mapping covers data later in the subflow stream,
+ * currently unsupported.
+ */
+ dbg_bad_map(subflow, ssn);
+ return false;
+ }
+ if (unlikely(!before(ssn, subflow->map_subflow_seq +
+ subflow->map_data_len))) {
+ /* Mapping does covers past subflow data, invalid */
+ dbg_bad_map(subflow, ssn);
+ return false;
+ }
+ return true;
+}
+
+static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
+ bool csum_reqd)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u32 offset, seq, delta;
+ __sum16 csum;
+ int len;
+
+ if (!csum_reqd)
+ return MAPPING_OK;
+
+ /* mapping already validated on previous traversal */
+ if (subflow->map_csum_len == subflow->map_data_len)
+ return MAPPING_OK;
+
+ /* traverse the receive queue, ensuring it contains a full
+ * DSS mapping and accumulating the related csum.
+ * Preserve the accoumlate csum across multiple calls, to compute
+ * the csum only once
+ */
+ delta = subflow->map_data_len - subflow->map_csum_len;
+ for (;;) {
+ seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
+ offset = seq - TCP_SKB_CB(skb)->seq;
+
+ /* if the current skb has not been accounted yet, csum its contents
+ * up to the amount covered by the current DSS
+ */
+ if (offset < skb->len) {
+ __wsum csum;
+
+ len = min(skb->len - offset, delta);
+ csum = skb_checksum(skb, offset, len, 0);
+ subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
+ subflow->map_csum_len);
+
+ delta -= len;
+ subflow->map_csum_len += len;
+ }
+ if (delta == 0)
+ break;
+
+ if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
+ /* if this subflow is closed, the partial mapping
+ * will be never completed; flush the pending skbs, so
+ * that subflow_sched_work_if_closed() can kick in
+ */
+ if (unlikely(ssk->sk_state == TCP_CLOSE))
+ while ((skb = skb_peek(&ssk->sk_receive_queue)))
+ sk_eat_skb(ssk, skb);
+
+ /* not enough data to validate the csum */
+ return MAPPING_EMPTY;
+ }
+
+ /* the DSS mapping for next skbs will be validated later,
+ * when a get_mapping_status call will process such skb
+ */
+ skb = skb->next;
+ }
+
+ /* note that 'map_data_len' accounts only for the carried data, does
+ * not include the eventual seq increment due to the data fin,
+ * while the pseudo header requires the original DSS data len,
+ * including that
+ */
+ csum = __mptcp_make_csum(subflow->map_seq,
+ subflow->map_subflow_seq,
+ subflow->map_data_len + subflow->map_data_fin,
+ subflow->map_data_csum);
+ if (unlikely(csum)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
+ return MAPPING_BAD_CSUM;
+ }
+
+ subflow->valid_csum_seen = 1;
+ return MAPPING_OK;
+}
+
+static enum mapping_status get_mapping_status(struct sock *ssk,
+ struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool csum_reqd = READ_ONCE(msk->csum_enabled);
+ struct mptcp_ext *mpext;
+ struct sk_buff *skb;
+ u16 data_len;
+ u64 map_seq;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ return MAPPING_EMPTY;
+
+ if (mptcp_check_fallback(ssk))
+ return MAPPING_DUMMY;
+
+ mpext = mptcp_get_ext(skb);
+ if (!mpext || !mpext->use_map) {
+ if (!subflow->map_valid && !skb->len) {
+ /* the TCP stack deliver 0 len FIN pkt to the receive
+ * queue, that is the only 0len pkts ever expected here,
+ * and we can admit no mapping only for 0 len pkts
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ WARN_ONCE(1, "0len seq %d:%d flags %x",
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ TCP_SKB_CB(skb)->tcp_flags);
+ sk_eat_skb(ssk, skb);
+ return MAPPING_EMPTY;
+ }
+
+ if (!subflow->map_valid)
+ return MAPPING_INVALID;
+
+ goto validate_seq;
+ }
+
+ trace_get_mapping_status(mpext);
+
+ data_len = mpext->data_len;
+ if (data_len == 0) {
+ pr_debug("infinite mapping received");
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
+ subflow->map_data_len = 0;
+ return MAPPING_INVALID;
+ }
+
+ if (mpext->data_fin == 1) {
+ if (data_len == 1) {
+ bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
+ mpext->dsn64);
+ pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
+ if (subflow->map_valid) {
+ /* A DATA_FIN might arrive in a DSS
+ * option before the previous mapping
+ * has been fully consumed. Continue
+ * handling the existing mapping.
+ */
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ } else {
+ if (updated)
+ mptcp_schedule_work((struct sock *)msk);
+
+ return MAPPING_DATA_FIN;
+ }
+ } else {
+ u64 data_fin_seq = mpext->data_seq + data_len - 1;
+
+ /* If mpext->data_seq is a 32-bit value, data_fin_seq
+ * must also be limited to 32 bits.
+ */
+ if (!mpext->dsn64)
+ data_fin_seq &= GENMASK_ULL(31, 0);
+
+ mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
+ pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
+ data_fin_seq, mpext->dsn64);
+ }
+
+ /* Adjust for DATA_FIN using 1 byte of sequence space */
+ data_len--;
+ }
+
+ map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64);
+ WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);
+
+ if (subflow->map_valid) {
+ /* Allow replacing only with an identical map */
+ if (subflow->map_seq == map_seq &&
+ subflow->map_subflow_seq == mpext->subflow_seq &&
+ subflow->map_data_len == data_len &&
+ subflow->map_csum_reqd == mpext->csum_reqd) {
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ goto validate_csum;
+ }
+
+ /* If this skb data are fully covered by the current mapping,
+ * the new map would need caching, which is not supported
+ */
+ if (skb_is_fully_mapped(ssk, skb)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
+ return MAPPING_INVALID;
+ }
+
+ /* will validate the next map after consuming the current one */
+ goto validate_csum;
+ }
+
+ subflow->map_seq = map_seq;
+ subflow->map_subflow_seq = mpext->subflow_seq;
+ subflow->map_data_len = data_len;
+ subflow->map_valid = 1;
+ subflow->map_data_fin = mpext->data_fin;
+ subflow->mpc_map = mpext->mpc_map;
+ subflow->map_csum_reqd = mpext->csum_reqd;
+ subflow->map_csum_len = 0;
+ subflow->map_data_csum = csum_unfold(mpext->csum);
+
+ /* Cfr RFC 8684 Section 3.3.0 */
+ if (unlikely(subflow->map_csum_reqd != csum_reqd))
+ return MAPPING_INVALID;
+
+ pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+ subflow->map_seq, subflow->map_subflow_seq,
+ subflow->map_data_len, subflow->map_csum_reqd,
+ subflow->map_data_csum);
+
+validate_seq:
+ /* we revalidate valid mapping on new skb, because we must ensure
+ * the current skb is completely covered by the available mapping
+ */
+ if (!validate_mapping(ssk, skb)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
+ return MAPPING_INVALID;
+ }
+
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+
+validate_csum:
+ return validate_data_csum(ssk, skb, csum_reqd);
+}
+
+static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
+ u64 limit)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ u32 incr;
+
+ incr = limit >= skb->len ? skb->len + fin : limit;
+
+ pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
+ subflow->map_subflow_seq);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
+ tcp_sk(ssk)->copied_seq += incr;
+ if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
+ sk_eat_skb(ssk, skb);
+ if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
+ subflow->map_valid = 0;
+}
+
+/* sched mptcp worker to remove the subflow if no more data is pending */
+static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
+{
+ if (likely(ssk->sk_state != TCP_CLOSE))
+ return;
+
+ if (skb_queue_empty(&ssk->sk_receive_queue) &&
+ !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+ mptcp_schedule_work((struct sock *)msk);
+}
+
+static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ if (subflow->mp_join)
+ return false;
+ else if (READ_ONCE(msk->csum_enabled))
+ return !subflow->valid_csum_seen;
+ else
+ return !subflow->fully_established;
+}
+
+static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned long fail_tout;
+
+ /* greceful failure can happen only on the MPC subflow */
+ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
+ return;
+
+ /* since the close timeout take precedence on the fail one,
+ * no need to start the latter when the first is already set
+ */
+ if (sock_flag((struct sock *)msk, SOCK_DEAD))
+ return;
+
+ /* we don't need extreme accuracy here, use a zero fail_tout as special
+ * value meaning no fail timeout at all;
+ */
+ fail_tout = jiffies + TCP_RTO_MAX;
+ if (!fail_tout)
+ fail_tout = 1;
+ WRITE_ONCE(subflow->fail_tout, fail_tout);
+ tcp_send_ack(ssk);
+
+ mptcp_reset_tout_timer(msk, subflow->fail_tout);
+}
+
+static bool subflow_check_data_avail(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ enum mapping_status status;
+ struct mptcp_sock *msk;
+ struct sk_buff *skb;
+
+ if (!skb_peek(&ssk->sk_receive_queue))
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ if (subflow->data_avail)
+ return true;
+
+ msk = mptcp_sk(subflow->conn);
+ for (;;) {
+ u64 ack_seq;
+ u64 old_ack;
+
+ status = get_mapping_status(ssk, msk);
+ trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
+ if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
+ status == MAPPING_BAD_CSUM))
+ goto fallback;
+
+ if (status != MAPPING_OK)
+ goto no_data;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ goto no_data;
+
+ if (unlikely(!READ_ONCE(msk->can_ack)))
+ goto fallback;
+
+ old_ack = READ_ONCE(msk->ack_seq);
+ ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ ack_seq);
+ if (unlikely(before64(ack_seq, old_ack))) {
+ mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
+ continue;
+ }
+
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ break;
+ }
+ return true;
+
+no_data:
+ subflow_sched_work_if_closed(msk, ssk);
+ return false;
+
+fallback:
+ if (!__mptcp_check_fallback(msk)) {
+ /* RFC 8684 section 3.7. */
+ if (status == MAPPING_BAD_CSUM &&
+ (subflow->mp_join || subflow->valid_csum_seen)) {
+ subflow->send_mp_fail = 1;
+
+ if (!READ_ONCE(msk->allow_infinite_fallback)) {
+ subflow->reset_transient = 0;
+ subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
+ goto reset;
+ }
+ mptcp_subflow_fail(msk, ssk);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ return true;
+ }
+
+ if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
+ /* fatal protocol error, close the socket.
+ * subflow_error_report() will introduce the appropriate barriers
+ */
+ subflow->reset_transient = 0;
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
+
+reset:
+ WRITE_ONCE(ssk->sk_err, EBADMSG);
+ tcp_set_state(ssk, TCP_CLOSE);
+ while ((skb = skb_peek(&ssk->sk_receive_queue)))
+ sk_eat_skb(ssk, skb);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ return false;
+ }
+
+ mptcp_do_fallback(ssk);
+ }
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ subflow->map_valid = 1;
+ subflow->map_seq = READ_ONCE(msk->ack_seq);
+ subflow->map_data_len = skb->len;
+ subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ return true;
+}
+
+bool mptcp_subflow_data_available(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ /* check if current mapping is still valid */
+ if (subflow->map_valid &&
+ mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
+ subflow->map_valid = 0;
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+
+ pr_debug("Done with mapping: seq=%u data_len=%u",
+ subflow->map_subflow_seq,
+ subflow->map_data_len);
+ }
+
+ return subflow_check_data_avail(sk);
+}
+
+/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
+ * not the ssk one.
+ *
+ * In mptcp, rwin is about the mptcp-level connection data.
+ *
+ * Data that is still on the ssk rx queue can thus be ignored,
+ * as far as mptcp peer is concerned that data is still inflight.
+ * DSS ACK is updated when skb is moved to the mptcp rx queue.
+ */
+void mptcp_space(const struct sock *ssk, int *space, int *full_space)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ const struct sock *sk = subflow->conn;
+
+ *space = __mptcp_space(sk);
+ *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
+}
+
+static void subflow_error_report(struct sock *ssk)
+{
+ struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+
+ /* bail early if this is a no-op, so that we avoid introducing a
+ * problematic lockdep dependency between TCP accept queue lock
+ * and msk socket spinlock
+ */
+ if (!sk->sk_socket)
+ return;
+
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_error_report(sk);
+ else
+ __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
+ mptcp_data_unlock(sk);
+}
+
+static void subflow_data_ready(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ u16 state = 1 << inet_sk_state_load(sk);
+ struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
+
+ trace_sk_data_ready(sk);
+
+ msk = mptcp_sk(parent);
+ if (state & TCPF_LISTEN) {
+ /* MPJ subflow are removed from accept queue before reaching here,
+ * avoid stray wakeups
+ */
+ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
+ return;
+
+ parent->sk_data_ready(parent);
+ return;
+ }
+
+ WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
+ !subflow->mp_join && !(state & TCPF_CLOSE));
+
+ if (mptcp_subflow_data_available(sk))
+ mptcp_data_ready(parent, sk);
+ else if (unlikely(sk->sk_err))
+ subflow_error_report(sk);
+}
+
+static void subflow_write_space(struct sock *ssk)
+{
+ struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+
+ mptcp_propagate_sndbuf(sk, ssk);
+ mptcp_write_space(sk);
+}
+
+static const struct inet_connection_sock_af_ops *
+subflow_default_af_ops(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return &subflow_v6_specific;
+#endif
+ return &subflow_specific;
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock_af_ops *target;
+
+ target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
+
+ pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+ subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
+
+ if (likely(icsk->icsk_af_ops == target))
+ return;
+
+ subflow->icsk_af_ops = icsk->icsk_af_ops;
+ icsk->icsk_af_ops = target;
+}
+#endif
+
+void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr,
+ unsigned short family)
+{
+ memset(addr, 0, sizeof(*addr));
+ addr->ss_family = family;
+ if (addr->ss_family == AF_INET) {
+ struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
+
+ if (info->family == AF_INET)
+ in_addr->sin_addr = info->addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ipv6_addr_v4mapped(&info->addr6))
+ in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
+#endif
+ in_addr->sin_port = info->port;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
+
+ if (info->family == AF_INET)
+ ipv6_addr_set_v4mapped(info->addr.s_addr,
+ &in6_addr->sin6_addr);
+ else
+ in6_addr->sin6_addr = info->addr6;
+ in6_addr->sin6_port = info->port;
+ }
+#endif
+}
+
+int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ struct sockaddr_storage addr;
+ int remote_id = remote->id;
+ int local_id = loc->id;
+ int err = -ENOTCONN;
+ struct socket *sf;
+ struct sock *ssk;
+ u32 remote_token;
+ int addrlen;
+ int ifindex;
+ u8 flags;
+
+ if (!mptcp_is_fully_established(sk))
+ goto err_out;
+
+ err = mptcp_subflow_create_socket(sk, loc->family, &sf);
+ if (err)
+ goto err_out;
+
+ ssk = sf->sk;
+ subflow = mptcp_subflow_ctx(ssk);
+ do {
+ get_random_bytes(&subflow->local_nonce, sizeof(u32));
+ } while (!subflow->local_nonce);
+
+ if (local_id)
+ subflow_set_local_id(subflow, local_id);
+
+ mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
+ &flags, &ifindex);
+ subflow->remote_key_valid = 1;
+ subflow->remote_key = msk->remote_key;
+ subflow->local_key = msk->local_key;
+ subflow->token = msk->token;
+ mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
+
+ addrlen = sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (addr.ss_family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ mptcp_sockopt_sync(msk, ssk);
+
+ ssk->sk_bound_dev_if = ifindex;
+ err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+ if (err)
+ goto failed;
+
+ mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
+ pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
+ remote_token, local_id, remote_id);
+ subflow->remote_token = remote_token;
+ subflow->remote_id = remote_id;
+ subflow->request_join = 1;
+ subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ subflow->subflow_id = msk->subflow_id++;
+ mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
+
+ sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->conn_list);
+ err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+ if (err && err != -EINPROGRESS)
+ goto failed_unlink;
+
+ /* discard the subflow socket */
+ mptcp_sock_graft(ssk, sk->sk_socket);
+ iput(SOCK_INODE(sf));
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+ mptcp_stop_tout_timer(sk);
+ return 0;
+
+failed_unlink:
+ list_del(&subflow->node);
+ sock_put(mptcp_subflow_tcp_sock(subflow));
+
+failed:
+ subflow->disposable = 1;
+ sock_release(sf);
+
+err_out:
+ /* we account subflows before the creation, and this failures will not
+ * be caught by sk_state_change()
+ */
+ mptcp_pm_close_subflow(msk);
+ return err;
+}
+
+static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
+{
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
+ *child_skcd = &child->sk_cgrp_data;
+
+ /* only the additional subflows created by kworkers have to be modified */
+ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
+ cgroup_id(sock_cgroup_ptr(child_skcd))) {
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = parent->sk_memcg;
+
+ mem_cgroup_sk_free(child);
+ if (memcg && css_tryget(&memcg->css))
+ child->sk_memcg = memcg;
+#endif /* CONFIG_MEMCG */
+
+ cgroup_sk_free(child_skcd);
+ *child_skcd = *parent_skcd;
+ cgroup_sk_clone(child_skcd);
+ }
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+}
+
+static void mptcp_subflow_ops_override(struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (ssk->sk_prot == &tcpv6_prot)
+ ssk->sk_prot = &tcpv6_prot_override;
+ else
+#endif
+ ssk->sk_prot = &tcp_prot_override;
+}
+
+static void mptcp_subflow_ops_undo_override(struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (ssk->sk_prot == &tcpv6_prot_override)
+ ssk->sk_prot = &tcpv6_prot;
+ else
+#endif
+ ssk->sk_prot = &tcp_prot;
+}
+
+int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
+ struct socket **new_sock)
+{
+ struct mptcp_subflow_context *subflow;
+ struct net *net = sock_net(sk);
+ struct socket *sf;
+ int err;
+
+ /* un-accepted server sockets can reach here - on bad configuration
+ * bail early to avoid greater trouble later
+ */
+ if (unlikely(!sk->sk_socket))
+ return -EINVAL;
+
+ err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
+ if (err)
+ return err;
+
+ lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);
+
+ err = security_mptcp_add_subflow(sk, sf->sk);
+ if (err)
+ goto release_ssk;
+
+ /* the newly created socket has to be in the same cgroup as its parent */
+ mptcp_attach_cgroup(sk, sf->sk);
+
+ /* kernel sockets do not by default acquire net ref, but TCP timer
+ * needs it.
+ * Update ns_tracker to current stack trace and refcounted tracker.
+ */
+ __netns_tracker_free(net, &sf->sk->ns_tracker, false);
+ sf->sk->sk_net_refcnt = 1;
+ get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+ err = tcp_set_ulp(sf->sk, "mptcp");
+
+release_ssk:
+ release_sock(sf->sk);
+
+ if (err) {
+ sock_release(sf);
+ return err;
+ }
+
+ /* the newly created socket really belongs to the owning MPTCP master
+ * socket, even if for additional subflows the allocation is performed
+ * by a kernel workqueue. Adjust inode references, so that the
+ * procfs/diag interfaces really show this one belonging to the correct
+ * user.
+ */
+ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
+ SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
+ SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
+
+ subflow = mptcp_subflow_ctx(sf->sk);
+ pr_debug("subflow=%p", subflow);
+
+ *new_sock = sf;
+ sock_hold(sk);
+ subflow->conn = sk;
+ mptcp_subflow_ops_override(sf->sk);
+
+ return 0;
+}
+
+static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
+ gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct mptcp_subflow_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), priority);
+ if (!ctx)
+ return NULL;
+
+ rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+ INIT_LIST_HEAD(&ctx->node);
+ INIT_LIST_HEAD(&ctx->delegated_node);
+
+ pr_debug("subflow=%p", ctx);
+
+ ctx->tcp_sock = sk;
+
+ return ctx;
+}
+
+static void __subflow_state_change(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static bool subflow_is_done(const struct sock *sk)
+{
+ return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
+}
+
+static void subflow_state_change(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+ struct mptcp_sock *msk;
+
+ __subflow_state_change(sk);
+
+ msk = mptcp_sk(parent);
+ if (subflow_simultaneous_connect(sk)) {
+ mptcp_do_fallback(sk);
+ mptcp_rcv_space_init(msk, sk);
+ pr_fallback(msk);
+ subflow->conn_finished = 1;
+ mptcp_propagate_state(parent, sk);
+ }
+
+ /* as recvmsg() does not acquire the subflow socket for ssk selection
+ * a fin packet carrying a DSS can be unnoticed if we don't trigger
+ * the data available machinery here.
+ */
+ if (mptcp_subflow_data_available(sk))
+ mptcp_data_ready(parent, sk);
+ else if (unlikely(sk->sk_err))
+ subflow_error_report(sk);
+
+ subflow_sched_work_if_closed(mptcp_sk(parent), sk);
+
+ /* when the fallback subflow closes the rx side, trigger a 'dummy'
+ * ingress data fin, so that the msk state will follow along
+ */
+ if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
+ mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
+ mptcp_schedule_work(parent);
+}
+
+void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
+{
+ struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
+ struct request_sock *req, *head, *tail;
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk, *ssk;
+
+ /* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
+ * Splice the req list, so that accept() can not reach the pending ssk after
+ * the listener socket is released below.
+ */
+ spin_lock_bh(&queue->rskq_lock);
+ head = queue->rskq_accept_head;
+ tail = queue->rskq_accept_tail;
+ queue->rskq_accept_head = NULL;
+ queue->rskq_accept_tail = NULL;
+ spin_unlock_bh(&queue->rskq_lock);
+
+ if (!head)
+ return;
+
+ /* can't acquire the msk socket lock under the subflow one,
+ * or will cause ABBA deadlock
+ */
+ release_sock(listener_ssk);
+
+ for (req = head; req; req = req->dl_next) {
+ ssk = req->sk;
+ if (!sk_is_mptcp(ssk))
+ continue;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ if (!subflow || !subflow->conn)
+ continue;
+
+ sk = subflow->conn;
+ sock_hold(sk);
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ __mptcp_unaccepted_force_close(sk);
+ release_sock(sk);
+
+ /* lockdep will report a false positive ABBA deadlock
+ * between cancel_work_sync and the listener socket.
+ * The involved locks belong to different sockets WRT
+ * the existing AB chain.
+ * Using a per socket key is problematic as key
+ * deregistration requires process context and must be
+ * performed at socket disposal time, in atomic
+ * context.
+ * Just tell lockdep to consider the listener socket
+ * released here.
+ */
+ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
+ mptcp_cancel_work(sk);
+ mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+
+ sock_put(sk);
+ }
+
+ /* we are still under the listener msk socket lock */
+ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+
+ /* restore the listener queue, to let the TCP code clean it up */
+ spin_lock_bh(&queue->rskq_lock);
+ WARN_ON_ONCE(queue->rskq_accept_head);
+ queue->rskq_accept_head = head;
+ queue->rskq_accept_tail = tail;
+ spin_unlock_bh(&queue->rskq_lock);
+}
+
+static int subflow_ulp_init(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct mptcp_subflow_context *ctx;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = 0;
+
+ /* disallow attaching ULP to a socket unless it has been
+ * created with sock_create_kern()
+ */
+ if (!sk->sk_kern_sock) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ctx = subflow_create_ctx(sk, GFP_KERNEL);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
+
+ tp->is_mptcp = 1;
+ ctx->icsk_af_ops = icsk->icsk_af_ops;
+ icsk->icsk_af_ops = subflow_default_af_ops(sk);
+ ctx->tcp_state_change = sk->sk_state_change;
+ ctx->tcp_error_report = sk->sk_error_report;
+
+ WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
+ WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);
+
+ sk->sk_data_ready = subflow_data_ready;
+ sk->sk_write_space = subflow_write_space;
+ sk->sk_state_change = subflow_state_change;
+ sk->sk_error_report = subflow_error_report;
+out:
+ return err;
+}
+
+static void subflow_ulp_release(struct sock *ssk)
+{
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+ bool release = true;
+ struct sock *sk;
+
+ if (!ctx)
+ return;
+
+ sk = ctx->conn;
+ if (sk) {
+ /* if the msk has been orphaned, keep the ctx
+ * alive, will be freed by __mptcp_close_ssk(),
+ * when the subflow is still unaccepted
+ */
+ release = ctx->disposable || list_empty(&ctx->node);
+
+ /* inet_child_forget() does not call sk_state_change(),
+ * explicitly trigger the socket close machinery
+ */
+ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
+ &mptcp_sk(sk)->flags))
+ mptcp_schedule_work(sk);
+ sock_put(sk);
+ }
+
+ mptcp_subflow_ops_undo_override(ssk);
+ if (release)
+ kfree_rcu(ctx, rcu);
+}
+
+static void subflow_ulp_clone(const struct request_sock *req,
+ struct sock *newsk,
+ const gfp_t priority)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
+ struct mptcp_subflow_context *new_ctx;
+
+ if (!tcp_rsk(req)->is_mptcp ||
+ (!subflow_req->mp_capable && !subflow_req->mp_join)) {
+ subflow_ulp_fallback(newsk, old_ctx);
+ return;
+ }
+
+ new_ctx = subflow_create_ctx(newsk, priority);
+ if (!new_ctx) {
+ subflow_ulp_fallback(newsk, old_ctx);
+ return;
+ }
+
+ new_ctx->conn_finished = 1;
+ new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+ new_ctx->tcp_state_change = old_ctx->tcp_state_change;
+ new_ctx->tcp_error_report = old_ctx->tcp_error_report;
+ new_ctx->rel_write_seq = 1;
+ new_ctx->tcp_sock = newsk;
+
+ if (subflow_req->mp_capable) {
+ /* see comments in subflow_syn_recv_sock(), MPTCP connection
+ * is fully established only after we receive the remote key
+ */
+ new_ctx->mp_capable = 1;
+ new_ctx->local_key = subflow_req->local_key;
+ new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
+
+ /* this is the first subflow, id is always 0 */
+ new_ctx->local_id_valid = 1;
+ } else if (subflow_req->mp_join) {
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->mp_join = 1;
+ new_ctx->fully_established = 1;
+ new_ctx->remote_key_valid = 1;
+ new_ctx->backup = subflow_req->backup;
+ new_ctx->remote_id = subflow_req->remote_id;
+ new_ctx->token = subflow_req->token;
+ new_ctx->thmac = subflow_req->thmac;
+
+ /* the subflow req id is valid, fetched via subflow_check_req()
+ * and subflow_token_join_request()
+ */
+ subflow_set_local_id(new_ctx, subflow_req->local_id);
+ }
+}
+
+static void tcp_release_cb_override(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ long status;
+
+ /* process and clear all the pending actions, but leave the subflow into
+ * the napi queue. To respect locking, only the same CPU that originated
+ * the action can touch the list. mptcp_napi_poll will take care of it.
+ */
+ status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0);
+ if (status)
+ mptcp_subflow_process_delegated(ssk, status);
+
+ tcp_release_cb(ssk);
+}
+
+static int tcp_abort_override(struct sock *ssk, int err)
+{
+ /* closing a listener subflow requires a great deal of care.
+ * keep it simple and just prevent such operation
+ */
+ if (inet_sk_state_load(ssk) == TCP_LISTEN)
+ return -EINVAL;
+
+ return tcp_abort(ssk, err);
+}
+
+static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
+ .name = "mptcp",
+ .owner = THIS_MODULE,
+ .init = subflow_ulp_init,
+ .release = subflow_ulp_release,
+ .clone = subflow_ulp_clone,
+};
+
+static int subflow_ops_init(struct request_sock_ops *subflow_ops)
+{
+ subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
+
+ subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
+ subflow_ops->obj_size, 0,
+ SLAB_ACCOUNT |
+ SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ if (!subflow_ops->slab)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __init mptcp_subflow_init(void)
+{
+ mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
+ mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
+ mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;
+
+ if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
+ panic("MPTCP: failed to init subflow v4 request sock ops\n");
+
+ subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+ subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
+ subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;
+
+ subflow_specific = ipv4_specific;
+ subflow_specific.conn_request = subflow_v4_conn_request;
+ subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
+ subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_specific.rebuild_header = subflow_rebuild_header;
+
+ tcp_prot_override = tcp_prot;
+ tcp_prot_override.release_cb = tcp_release_cb_override;
+ tcp_prot_override.diag_destroy = tcp_abort_override;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ /* In struct mptcp_subflow_request_sock, we assume the TCP request sock
+ * structures for v4 and v6 have the same size. It should not changed in
+ * the future but better to make sure to be warned if it is no longer
+ * the case.
+ */
+ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));
+
+ mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
+ mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
+ mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;
+
+ if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
+ panic("MPTCP: failed to init subflow v6 request sock ops\n");
+
+ subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+ subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
+ subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;
+
+ subflow_v6_specific = ipv6_specific;
+ subflow_v6_specific.conn_request = subflow_v6_conn_request;
+ subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
+ subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;
+
+ subflow_v6m_specific = subflow_v6_specific;
+ subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
+ subflow_v6m_specific.send_check = ipv4_specific.send_check;
+ subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
+ subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
+ subflow_v6m_specific.net_frag_header_len = 0;
+ subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
+
+ tcpv6_prot_override = tcpv6_prot;
+ tcpv6_prot_override.release_cb = tcp_release_cb_override;
+ tcpv6_prot_override.diag_destroy = tcp_abort_override;
+#endif
+
+ mptcp_diag_subflow_init(&subflow_ulp_ops);
+
+ if (tcp_register_ulp(&subflow_ulp_ops) != 0)
+ panic("MPTCP: failed to register subflows to ULP\n");
+}
diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c
new file mode 100644
index 000000000..7f2252634
--- /dev/null
+++ b/net/mptcp/syncookies.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/skbuff.h>
+
+#include "protocol.h"
+
+/* Syncookies do not work for JOIN requests.
+ *
+ * Unlike MP_CAPABLE, where the ACK cookie contains the needed MPTCP
+ * options to reconstruct the initial syn state, MP_JOIN does not contain
+ * the token to obtain the mptcp socket nor the server-generated nonce
+ * that was used in the cookie SYN/ACK response.
+ *
+ * Keep a small best effort state table to store the syn/synack data,
+ * indexed by skb hash.
+ *
+ * A MP_JOIN SYN packet handled by syn cookies is only stored if the 32bit
+ * token matches a known mptcp connection that can still accept more subflows.
+ *
+ * There is no timeout handling -- state is only re-constructed
+ * when the TCP ACK passed the cookie validation check.
+ */
+
+struct join_entry {
+ u32 token;
+ u32 remote_nonce;
+ u32 local_nonce;
+ u8 join_id;
+ u8 local_id;
+ u8 backup;
+ u8 valid;
+};
+
+#define COOKIE_JOIN_SLOTS 1024
+
+static struct join_entry join_entries[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp;
+static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp;
+
+static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net)
+{
+ static u32 mptcp_join_hash_secret __read_mostly;
+ struct tcphdr *th = tcp_hdr(skb);
+ u32 seq, i;
+
+ net_get_random_once(&mptcp_join_hash_secret,
+ sizeof(mptcp_join_hash_secret));
+
+ if (th->syn)
+ seq = TCP_SKB_CB(skb)->seq;
+ else
+ seq = TCP_SKB_CB(skb)->seq - 1;
+
+ i = jhash_3words(seq, net_hash_mix(net),
+ (__force __u32)th->source << 16 | (__force __u32)th->dest,
+ mptcp_join_hash_secret);
+
+ return i % ARRAY_SIZE(join_entries);
+}
+
+static void mptcp_join_store_state(struct join_entry *entry,
+ const struct mptcp_subflow_request_sock *subflow_req)
+{
+ entry->token = subflow_req->token;
+ entry->remote_nonce = subflow_req->remote_nonce;
+ entry->local_nonce = subflow_req->local_nonce;
+ entry->backup = subflow_req->backup;
+ entry->join_id = subflow_req->remote_id;
+ entry->local_id = subflow_req->local_id;
+ entry->valid = 1;
+}
+
+void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb)
+{
+ struct net *net = read_pnet(&subflow_req->sk.req.ireq_net);
+ u32 i = mptcp_join_entry_hash(skb, net);
+
+ /* No use in waiting if other cpu is already using this slot --
+ * would overwrite the data that got stored.
+ */
+ spin_lock_bh(&join_entry_locks[i]);
+ mptcp_join_store_state(&join_entries[i], subflow_req);
+ spin_unlock_bh(&join_entry_locks[i]);
+}
+
+/* Called for a cookie-ack with MP_JOIN option present.
+ * Look up the saved state based on skb hash & check token matches msk
+ * in same netns.
+ *
+ * Caller will check msk can still accept another subflow. The hmac
+ * present in the cookie ACK mptcp option space will be checked later.
+ */
+bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
+ struct sk_buff *skb)
+{
+ struct net *net = read_pnet(&subflow_req->sk.req.ireq_net);
+ u32 i = mptcp_join_entry_hash(skb, net);
+ struct mptcp_sock *msk;
+ struct join_entry *e;
+
+ e = &join_entries[i];
+
+ spin_lock_bh(&join_entry_locks[i]);
+
+ if (e->valid == 0) {
+ spin_unlock_bh(&join_entry_locks[i]);
+ return false;
+ }
+
+ e->valid = 0;
+
+ msk = mptcp_token_get_sock(net, e->token);
+ if (!msk) {
+ spin_unlock_bh(&join_entry_locks[i]);
+ return false;
+ }
+
+ subflow_req->remote_nonce = e->remote_nonce;
+ subflow_req->local_nonce = e->local_nonce;
+ subflow_req->backup = e->backup;
+ subflow_req->remote_id = e->join_id;
+ subflow_req->token = e->token;
+ subflow_req->msk = msk;
+ spin_unlock_bh(&join_entry_locks[i]);
+ return true;
+}
+
+void __init mptcp_join_cookie_init(void)
+{
+ int i;
+
+ for (i = 0; i < COOKIE_JOIN_SLOTS; i++)
+ spin_lock_init(&join_entry_locks[i]);
+}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
new file mode 100644
index 000000000..5bb924534
--- /dev/null
+++ b/net/mptcp/token.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP token management
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ *
+ * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org,
+ * authored by:
+ *
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ * Gregory Detal <gregory.detal@uclouvain.be>
+ * Fabien Duchêne <fabien.duchene@uclouvain.be>
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
+ * Andreas Ripke <ripke@neclab.eu>
+ * Vlad Dogaru <vlad.dogaru@intel.com>
+ * Octavian Purdila <octavian.purdila@intel.com>
+ * John Ronan <jronan@tssg.org>
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
+ * Brandon Heller <brandonh@stanford.edu>
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/memblock.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/protocol.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+#define TOKEN_MAX_CHAIN_LEN 4
+
+struct token_bucket {
+ spinlock_t lock;
+ int chain_len;
+ struct hlist_nulls_head req_chain;
+ struct hlist_nulls_head msk_chain;
+};
+
+static struct token_bucket *token_hash __read_mostly;
+static unsigned int token_mask __read_mostly;
+
+static struct token_bucket *token_bucket(u32 token)
+{
+ return &token_hash[token & token_mask];
+}
+
+/* called with bucket lock held */
+static struct mptcp_subflow_request_sock *
+__token_lookup_req(struct token_bucket *t, u32 token)
+{
+ struct mptcp_subflow_request_sock *req;
+ struct hlist_nulls_node *pos;
+
+ hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node)
+ if (req->token == token)
+ return req;
+ return NULL;
+}
+
+/* called with bucket lock held */
+static struct mptcp_sock *
+__token_lookup_msk(struct token_bucket *t, u32 token)
+{
+ struct hlist_nulls_node *pos;
+ struct sock *sk;
+
+ sk_nulls_for_each_rcu(sk, pos, &t->msk_chain)
+ if (mptcp_sk(sk)->token == token)
+ return mptcp_sk(sk);
+ return NULL;
+}
+
+static bool __token_bucket_busy(struct token_bucket *t, u32 token)
+{
+ return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN ||
+ __token_lookup_req(t, token) || __token_lookup_msk(t, token);
+}
+
+static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
+{
+ /* we might consider a faster version that computes the key as a
+ * hash of some information available in the MPTCP socket. Use
+ * random data at the moment, as it's probably the safest option
+ * in case multiple sockets are opened in different namespaces at
+ * the same time.
+ */
+ get_random_bytes(key, sizeof(u64));
+ mptcp_crypto_key_sha(*key, token, idsn);
+}
+
+/**
+ * mptcp_token_new_request - create new key/idsn/token for subflow_request
+ * @req: the request socket
+ *
+ * This function is called when a new mptcp connection is coming in.
+ *
+ * It creates a unique token to identify the new mptcp connection,
+ * a secret local key and the initial data sequence number (idsn).
+ *
+ * Returns 0 on success.
+ */
+int mptcp_token_new_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct token_bucket *bucket;
+ u32 token;
+
+ mptcp_crypto_key_sha(subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
+ req, subflow_req->local_key, subflow_req->token,
+ subflow_req->idsn);
+
+ token = subflow_req->token;
+ bucket = token_bucket(token);
+ spin_lock_bh(&bucket->lock);
+ if (__token_bucket_busy(bucket, token)) {
+ spin_unlock_bh(&bucket->lock);
+ return -EBUSY;
+ }
+
+ hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain);
+ bucket->chain_len++;
+ spin_unlock_bh(&bucket->lock);
+ return 0;
+}
+
+/**
+ * mptcp_token_new_connect - create new key/idsn/token for subflow
+ * @ssk: the socket that will initiate a connection
+ *
+ * This function is called when a new outgoing mptcp connection is
+ * initiated.
+ *
+ * It creates a unique token to identify the new mptcp connection,
+ * a secret local key and the initial data sequence number (idsn).
+ *
+ * On success, the mptcp connection can be found again using
+ * the computed token at a later time, this is needed to process
+ * join requests.
+ *
+ * returns 0 on success.
+ */
+int mptcp_token_new_connect(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ int retries = MPTCP_TOKEN_MAX_RETRIES;
+ struct sock *sk = subflow->conn;
+ struct token_bucket *bucket;
+
+again:
+ mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
+ &subflow->idsn);
+
+ bucket = token_bucket(subflow->token);
+ spin_lock_bh(&bucket->lock);
+ if (__token_bucket_busy(bucket, subflow->token)) {
+ spin_unlock_bh(&bucket->lock);
+ if (!--retries)
+ return -EBUSY;
+ goto again;
+ }
+
+ pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
+ ssk, subflow->local_key, subflow->token, subflow->idsn);
+
+ WRITE_ONCE(msk->token, subflow->token);
+ __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
+ bucket->chain_len++;
+ spin_unlock_bh(&bucket->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return 0;
+}
+
+/**
+ * mptcp_token_accept - replace a req sk with full sock in token hash
+ * @req: the request socket to be removed
+ * @msk: the just cloned socket linked to the new connection
+ *
+ * Called when a SYN packet creates a new logical connection, i.e.
+ * is not a join request.
+ */
+void mptcp_token_accept(struct mptcp_subflow_request_sock *req,
+ struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_request_sock *pos;
+ struct sock *sk = (struct sock *)msk;
+ struct token_bucket *bucket;
+
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ bucket = token_bucket(req->token);
+ spin_lock_bh(&bucket->lock);
+
+ /* pedantic lookup check for the moved token */
+ pos = __token_lookup_req(bucket, req->token);
+ if (!WARN_ON_ONCE(pos != req))
+ hlist_nulls_del_init_rcu(&req->token_node);
+ __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
+ spin_unlock_bh(&bucket->lock);
+}
+
+bool mptcp_token_exists(u32 token)
+{
+ struct hlist_nulls_node *pos;
+ struct token_bucket *bucket;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+
+ rcu_read_lock();
+ bucket = token_bucket(token);
+
+again:
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ msk = mptcp_sk(sk);
+ if (READ_ONCE(msk->token) == token)
+ goto found;
+ }
+ if (get_nulls_value(pos) != (token & token_mask))
+ goto again;
+
+ rcu_read_unlock();
+ return false;
+found:
+ rcu_read_unlock();
+ return true;
+}
+
+/**
+ * mptcp_token_get_sock - retrieve mptcp connection sock using its token
+ * @net: restrict to this namespace
+ * @token: token of the mptcp connection to retrieve
+ *
+ * This function returns the mptcp connection structure with the given token.
+ * A reference count on the mptcp socket returned is taken.
+ *
+ * returns NULL if no connection with the given token value exists.
+ */
+struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token)
+{
+ struct hlist_nulls_node *pos;
+ struct token_bucket *bucket;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+
+ rcu_read_lock();
+ bucket = token_bucket(token);
+
+again:
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ msk = mptcp_sk(sk);
+ if (READ_ONCE(msk->token) != token ||
+ !net_eq(sock_net(sk), net))
+ continue;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto not_found;
+
+ if (READ_ONCE(msk->token) != token ||
+ !net_eq(sock_net(sk), net)) {
+ sock_put(sk);
+ goto again;
+ }
+ goto found;
+ }
+ if (get_nulls_value(pos) != (token & token_mask))
+ goto again;
+
+not_found:
+ msk = NULL;
+
+found:
+ rcu_read_unlock();
+ return msk;
+}
+EXPORT_SYMBOL_GPL(mptcp_token_get_sock);
+
+/**
+ * mptcp_token_iter_next - iterate over the token container from given pos
+ * @net: namespace to be iterated
+ * @s_slot: start slot number
+ * @s_num: start number inside the given lock
+ *
+ * This function returns the first mptcp connection structure found inside the
+ * token container starting from the specified position, or NULL.
+ *
+ * On successful iteration, the iterator is moved to the next position and
+ * a reference to the returned socket is acquired.
+ */
+struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
+ long *s_num)
+{
+ struct mptcp_sock *ret = NULL;
+ struct hlist_nulls_node *pos;
+ int slot, num = 0;
+
+ for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) {
+ struct token_bucket *bucket = &token_hash[slot];
+ struct sock *sk;
+
+ num = 0;
+
+ if (hlist_nulls_empty(&bucket->msk_chain))
+ continue;
+
+ rcu_read_lock();
+ sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
+ ++num;
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num <= *s_num)
+ continue;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ continue;
+
+ if (!net_eq(sock_net(sk), net)) {
+ sock_put(sk);
+ continue;
+ }
+
+ ret = mptcp_sk(sk);
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ }
+
+out:
+ *s_slot = slot;
+ *s_num = num;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mptcp_token_iter_next);
+
+/**
+ * mptcp_token_destroy_request - remove mptcp connection/token
+ * @req: mptcp request socket dropping the token
+ *
+ * Remove the token associated to @req.
+ */
+void mptcp_token_destroy_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_subflow_request_sock *pos;
+ struct token_bucket *bucket;
+
+ if (hlist_nulls_unhashed(&subflow_req->token_node))
+ return;
+
+ bucket = token_bucket(subflow_req->token);
+ spin_lock_bh(&bucket->lock);
+ pos = __token_lookup_req(bucket, subflow_req->token);
+ if (!WARN_ON_ONCE(pos != subflow_req)) {
+ hlist_nulls_del_init_rcu(&pos->token_node);
+ bucket->chain_len--;
+ }
+ spin_unlock_bh(&bucket->lock);
+}
+
+/**
+ * mptcp_token_destroy - remove mptcp connection/token
+ * @msk: mptcp connection dropping the token
+ *
+ * Remove the token associated to @msk
+ */
+void mptcp_token_destroy(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct token_bucket *bucket;
+ struct mptcp_sock *pos;
+
+ if (sk_unhashed((struct sock *)msk))
+ return;
+
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ bucket = token_bucket(msk->token);
+ spin_lock_bh(&bucket->lock);
+ pos = __token_lookup_msk(bucket, msk->token);
+ if (!WARN_ON_ONCE(pos != msk)) {
+ __sk_nulls_del_node_init_rcu((struct sock *)pos);
+ bucket->chain_len--;
+ }
+ spin_unlock_bh(&bucket->lock);
+ WRITE_ONCE(msk->token, 0);
+}
+
+void __init mptcp_token_init(void)
+{
+ int i;
+
+ token_hash = alloc_large_system_hash("MPTCP token",
+ sizeof(struct token_bucket),
+ 0,
+ 20,/* one slot per 1MB of memory */
+ HASH_ZERO,
+ NULL,
+ &token_mask,
+ 0,
+ 64 * 1024);
+ for (i = 0; i < token_mask + 1; ++i) {
+ INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i);
+ INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i);
+ spin_lock_init(&token_hash[i].lock);
+ }
+}
+
+#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
+EXPORT_SYMBOL_GPL(mptcp_token_new_request);
+EXPORT_SYMBOL_GPL(mptcp_token_new_connect);
+EXPORT_SYMBOL_GPL(mptcp_token_accept);
+EXPORT_SYMBOL_GPL(mptcp_token_destroy_request);
+EXPORT_SYMBOL_GPL(mptcp_token_destroy);
+#endif
diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c
new file mode 100644
index 000000000..0758865ab
--- /dev/null
+++ b/net/mptcp/token_test.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "protocol.h"
+
+static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req;
+
+ req = kunit_kzalloc(test, sizeof(struct mptcp_subflow_request_sock),
+ GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req);
+ mptcp_token_init_request((struct request_sock *)req);
+ sock_net_set((struct sock *)req, &init_net);
+ return req;
+}
+
+static void mptcp_token_test_req_basic(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req = build_req_sock(test);
+ struct mptcp_sock *null_msk = NULL;
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_request((struct request_sock *)req));
+ KUNIT_EXPECT_NE(test, 0, (int)req->token);
+ KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, req->token));
+
+ /* cleanup */
+ mptcp_token_destroy_request((struct request_sock *)req);
+}
+
+static struct inet_connection_sock *build_icsk(struct kunit *test)
+{
+ struct inet_connection_sock *icsk;
+
+ icsk = kunit_kzalloc(test, sizeof(struct inet_connection_sock),
+ GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, icsk);
+ return icsk;
+}
+
+static struct mptcp_subflow_context *build_ctx(struct kunit *test)
+{
+ struct mptcp_subflow_context *ctx;
+
+ ctx = kunit_kzalloc(test, sizeof(struct mptcp_subflow_context),
+ GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ctx);
+ return ctx;
+}
+
+static struct mptcp_sock *build_msk(struct kunit *test)
+{
+ struct mptcp_sock *msk;
+
+ msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk);
+ refcount_set(&((struct sock *)msk)->sk_refcnt, 1);
+ sock_net_set((struct sock *)msk, &init_net);
+
+ /* be sure the token helpers can dereference sk->sk_prot */
+ ((struct sock *)msk)->sk_prot = &tcp_prot;
+ return msk;
+}
+
+static void mptcp_token_test_msk_basic(struct kunit *test)
+{
+ struct inet_connection_sock *icsk = build_icsk(test);
+ struct mptcp_subflow_context *ctx = build_ctx(test);
+ struct mptcp_sock *msk = build_msk(test);
+ struct mptcp_sock *null_msk = NULL;
+ struct sock *sk;
+
+ rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+ ctx->conn = (struct sock *)msk;
+ sk = (struct sock *)msk;
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_connect((struct sock *)icsk));
+ KUNIT_EXPECT_NE(test, 0, (int)ctx->token);
+ KUNIT_EXPECT_EQ(test, ctx->token, msk->token);
+ KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, ctx->token));
+ KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt));
+
+ mptcp_token_destroy(msk);
+ KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, ctx->token));
+}
+
+static void mptcp_token_test_accept(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req = build_req_sock(test);
+ struct mptcp_sock *msk = build_msk(test);
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_request((struct request_sock *)req));
+ msk->token = req->token;
+ mptcp_token_accept(req, msk);
+ KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token));
+
+ /* this is now a no-op */
+ mptcp_token_destroy_request((struct request_sock *)req);
+ KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token));
+
+ /* cleanup */
+ mptcp_token_destroy(msk);
+}
+
+static void mptcp_token_test_destroyed(struct kunit *test)
+{
+ struct mptcp_subflow_request_sock *req = build_req_sock(test);
+ struct mptcp_sock *msk = build_msk(test);
+ struct mptcp_sock *null_msk = NULL;
+ struct sock *sk;
+
+ sk = (struct sock *)msk;
+
+ KUNIT_ASSERT_EQ(test, 0,
+ mptcp_token_new_request((struct request_sock *)req));
+ msk->token = req->token;
+ mptcp_token_accept(req, msk);
+
+ /* simulate race on removal */
+ refcount_set(&sk->sk_refcnt, 0);
+ KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, msk->token));
+
+ /* cleanup */
+ mptcp_token_destroy(msk);
+}
+
+static struct kunit_case mptcp_token_test_cases[] = {
+ KUNIT_CASE(mptcp_token_test_req_basic),
+ KUNIT_CASE(mptcp_token_test_msk_basic),
+ KUNIT_CASE(mptcp_token_test_accept),
+ KUNIT_CASE(mptcp_token_test_destroyed),
+ {}
+};
+
+static struct kunit_suite mptcp_token_suite = {
+ .name = "mptcp-token",
+ .test_cases = mptcp_token_test_cases,
+};
+
+kunit_test_suite(mptcp_token_suite);
+
+MODULE_LICENSE("GPL");