From 2c3c1048746a4622d8c89a29670120dc8fab93c4 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:49:45 +0200 Subject: Adding upstream version 6.1.76. Signed-off-by: Daniel Baumann --- net/netfilter/ipvs/Kconfig | 352 +++ net/netfilter/ipvs/Makefile | 45 + net/netfilter/ipvs/ip_vs_app.c | 617 +++++ net/netfilter/ipvs/ip_vs_conn.c | 1538 +++++++++++ net/netfilter/ipvs/ip_vs_core.c | 2456 ++++++++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 4288 +++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_dh.c | 272 ++ net/netfilter/ipvs/ip_vs_est.c | 204 ++ net/netfilter/ipvs/ip_vs_fo.c | 74 + net/netfilter/ipvs/ip_vs_ftp.c | 637 +++++ net/netfilter/ipvs/ip_vs_lblc.c | 630 +++++ net/netfilter/ipvs/ip_vs_lblcr.c | 815 ++++++ net/netfilter/ipvs/ip_vs_lc.c | 88 + net/netfilter/ipvs/ip_vs_mh.c | 539 ++++ net/netfilter/ipvs/ip_vs_nfct.c | 280 ++ net/netfilter/ipvs/ip_vs_nq.c | 138 + net/netfilter/ipvs/ip_vs_ovf.c | 81 + net/netfilter/ipvs/ip_vs_pe.c | 112 + net/netfilter/ipvs/ip_vs_pe_sip.c | 187 ++ net/netfilter/ipvs/ip_vs_proto.c | 384 +++ net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 157 ++ net/netfilter/ipvs/ip_vs_proto_sctp.c | 595 +++++ net/netfilter/ipvs/ip_vs_proto_tcp.c | 746 ++++++ net/netfilter/ipvs/ip_vs_proto_udp.c | 503 ++++ net/netfilter/ipvs/ip_vs_rr.c | 125 + net/netfilter/ipvs/ip_vs_sched.c | 250 ++ net/netfilter/ipvs/ip_vs_sed.c | 139 + net/netfilter/ipvs/ip_vs_sh.c | 378 +++ net/netfilter/ipvs/ip_vs_sync.c | 2051 +++++++++++++++ net/netfilter/ipvs/ip_vs_twos.c | 139 + net/netfilter/ipvs/ip_vs_wlc.c | 111 + net/netfilter/ipvs/ip_vs_wrr.c | 265 ++ net/netfilter/ipvs/ip_vs_xmit.c | 1686 ++++++++++++ 33 files changed, 20882 insertions(+) create mode 100644 net/netfilter/ipvs/Kconfig create mode 100644 net/netfilter/ipvs/Makefile create mode 100644 net/netfilter/ipvs/ip_vs_app.c create mode 100644 net/netfilter/ipvs/ip_vs_conn.c create mode 100644 net/netfilter/ipvs/ip_vs_core.c create mode 100644 net/netfilter/ipvs/ip_vs_ctl.c create mode 100644 net/netfilter/ipvs/ip_vs_dh.c create mode 100644 net/netfilter/ipvs/ip_vs_est.c create mode 100644 net/netfilter/ipvs/ip_vs_fo.c create mode 100644 net/netfilter/ipvs/ip_vs_ftp.c create mode 100644 net/netfilter/ipvs/ip_vs_lblc.c create mode 100644 net/netfilter/ipvs/ip_vs_lblcr.c create mode 100644 net/netfilter/ipvs/ip_vs_lc.c create mode 100644 net/netfilter/ipvs/ip_vs_mh.c create mode 100644 net/netfilter/ipvs/ip_vs_nfct.c create mode 100644 net/netfilter/ipvs/ip_vs_nq.c create mode 100644 net/netfilter/ipvs/ip_vs_ovf.c create mode 100644 net/netfilter/ipvs/ip_vs_pe.c create mode 100644 net/netfilter/ipvs/ip_vs_pe_sip.c create mode 100644 net/netfilter/ipvs/ip_vs_proto.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_ah_esp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_sctp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_tcp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_udp.c create mode 100644 net/netfilter/ipvs/ip_vs_rr.c create mode 100644 net/netfilter/ipvs/ip_vs_sched.c create mode 100644 net/netfilter/ipvs/ip_vs_sed.c create mode 100644 net/netfilter/ipvs/ip_vs_sh.c create mode 100644 net/netfilter/ipvs/ip_vs_sync.c create mode 100644 net/netfilter/ipvs/ip_vs_twos.c create mode 100644 net/netfilter/ipvs/ip_vs_wlc.c create mode 100644 net/netfilter/ipvs/ip_vs_wrr.c create mode 100644 net/netfilter/ipvs/ip_vs_xmit.c (limited to 'net/netfilter/ipvs') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig new file mode 100644 index 000000000..2a3017b9c --- /dev/null +++ b/net/netfilter/ipvs/Kconfig @@ -0,0 +1,352 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# IP Virtual Server configuration +# +menuconfig IP_VS + tristate "IP virtual server support" + depends on INET && NETFILTER + depends on (NF_CONNTRACK || NF_CONNTRACK=n) + help + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: . + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +if IP_VS + +config IP_VS_IPV6 + bool "IPv6 support for IPVS" + depends on IPV6 = y || IP_VS = IPV6 + select NF_DEFRAG_IPV6 + help + Add IPv6 support to IPVS. + + Say Y if unsure. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + help + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + range 8 20 if !64BIT + range 8 27 if 64BIT + default 12 + help + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 27 for 64BIT(20 otherwise), the default number is 12, + which means the table size is 4096. Don't input the number too + small, otherwise you will lose performance on it. You can adapt the + table size yourself, according to your virtual server application. + It is good to set the table size not far less than the number of + connections per second multiplying average lasting time of + connection in the table. For example, your virtual server gets 200 + connections per second, the connection lasts for 200 seconds in + average in the connection table, the table size should be not far + less than 200x200, it is good to set the table size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + + You can overwrite this number setting conn_tab_bits module parameter + or by appending ip_vs.conn_tab_bits=? to the kernel command line if + IP VS was compiled built-in. + +comment "IPVS transport protocol load balancing support" + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + help + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + help + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_AH_ESP + def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + help + This option enables support for load balancing ESP (Encapsulation + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + help + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_SCTP + bool "SCTP load balancing support" + select LIBCRC32C + help + This option enables support for load balancing SCTP transport + protocol. Say Y if unsure. + +comment "IPVS scheduler" + +config IP_VS_RR + tristate "round-robin scheduling" + help + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + help + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + help + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + help + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_FO + tristate "weighted failover scheduling" + help + The weighted failover scheduling algorithm directs network + connections to the server with the highest weight that is + currently available. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_OVF + tristate "weighted overflow scheduling" + help + The weighted overflow scheduling algorithm directs network + connections to the server with the highest weight that is + currently available and overflows to the next when active + connections exceed the node's weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + help + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + help + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + help + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + help + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_MH + tristate "maglev hashing scheduling" + help + The maglev consistent hashing scheduling algorithm provides the + Google's Maglev hashing algorithm as a IPVS scheduler. It assigns + network connections to the servers through looking up a statically + assigned special hash table called the lookup table. Maglev hashing + is to assign a preference list of all the lookup table positions + to each destination. + + Through this operation, The maglev hashing gives an almost equal + share of the lookup table to each of the destinations and provides + minimal disruption by using the lookup table. When the set of + destinations changes, a connection will likely be sent to the same + destination as it was before. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + help + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + help + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_TWOS + tristate "weighted random twos choice least-connection scheduling" + help + The weighted random twos choice least-connection scheduling + algorithm picks two random real servers and directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment 'IPVS SH scheduler' + +config IP_VS_SH_TAB_BITS + int "IPVS source hashing table size (the Nth power of 2)" + range 4 20 + default 8 + help + The source hashing scheduler maps source IPs to destinations + stored in a hash table. This table is tiled by each destination + until all slots in the table are filled. When using weights to + allow destinations to receive more connections, the table is + tiled an amount proportional to the weights specified. The table + needs to be large enough to effectively fit all the destinations + multiplied by their respective weights. + +comment 'IPVS MH scheduler' + +config IP_VS_MH_TAB_INDEX + int "IPVS maglev hashing table index of size (the prime numbers)" + range 8 17 + default 12 + help + The maglev hashing scheduler maps source IPs to destinations + stored in a hash table. This table is assigned by a preference + list of the positions to each destination until all slots in + the table are filled. The index determines the prime for size of + the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, + 65521 or 131071. When using weights to allow destinations to + receive more connections, the table is assigned an amount + proportional to the weights specified. The table needs to be large + enough to effectively fit all the destinations multiplied by their + respective weights. + +comment 'IPVS application helper' + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ + NF_CONNTRACK_FTP + select IP_VS_NFCT + help + FTP is a protocol that transfers IP address and/or port number in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NFCT + bool "Netfilter connection tracking" + depends on NF_CONNTRACK + help + The Netfilter connection tracking support allows the IPVS + connection state to be exported to the Netfilter framework + for filtering purposes. + +config IP_VS_PE_SIP + tristate "SIP persistence engine" + depends on IP_VS_PROTO_UDP + depends on NF_CONNTRACK_SIP + help + Allow persistence based on the SIP Call-ID + +endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile new file mode 100644 index 000000000..bb5d8125c --- /dev/null +++ b/net/netfilter/ipvs/Makefile @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the IPVS modules on top of IPv4. +# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o + +ip_vs-extra_objs-y := +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \ + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o +obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o +obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o + +# IPVS connection template retrievers +obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c new file mode 100644 index 000000000..fdacbc3c1 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -0,0 +1,617 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_app.c: Application module support for IPVS + * + * Authors: Wensong Zhang + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +static DEFINE_MUTEX(__ip_vs_app_mutex); + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + +static void ip_vs_app_inc_destroy(struct ip_vs_app *inc) +{ + kfree(inc->timeout_table); + kfree(inc); +} + +static void ip_vs_app_inc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head); + + ip_vs_app_inc_destroy(inc); +} + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(ipvs, inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s App %s:%u registered\n", + pp->name, inc->name, ntohs(inc->port)); + + return 0; + + out: + ip_vs_app_inc_destroy(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(ipvs, inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, ntohs(inc->port)); + + list_del(&inc->a_list); + + call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + result = ip_vs_app_get(inc->app); + if (result) + atomic_inc(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + atomic_dec(&inc->usecnt); + ip_vs_app_put(inc->app); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(ipvs, app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* Register application for netns */ +struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) +{ + struct ip_vs_app *a; + int err = 0; + + mutex_lock(&__ip_vs_app_mutex); + + /* increase the module use count */ + if (!ip_vs_use_count_inc()) { + err = -ENOENT; + goto out_unlock; + } + + list_for_each_entry(a, &ipvs->app_list, a_list) { + if (!strcmp(app->name, a->name)) { + err = -EEXIST; + /* decrease the module use count */ + ip_vs_use_count_dec(); + goto out_unlock; + } + } + a = kmemdup(app, sizeof(*app), GFP_KERNEL); + if (!a) { + err = -ENOMEM; + /* decrease the module use count */ + ip_vs_use_count_dec(); + goto out_unlock; + } + INIT_LIST_HEAD(&a->incs_list); + list_add(&a->a_list, &ipvs->app_list); + +out_unlock: + mutex_unlock(&__ip_vs_app_mutex); + + return err ? ERR_PTR(err) : a; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + * Caller should use synchronize_rcu() or rcu_barrier() + */ +void unregister_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *app) +{ + struct ip_vs_app *a, *anxt, *inc, *nxt; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) { + if (app && strcmp(app->name, a->name)) + continue; + list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) { + ip_vs_app_inc_release(ipvs, inc); + } + + list_del(&a->a_list); + kfree(a); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + } + + mutex_unlock(&__ip_vs_app_mutex); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "%s(): added delta (%d) to seq\n", + __func__, vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "%s(): subtracted delta " + "(%d) from ack_seq\n", __func__, vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "%s(): subtracted " + "previous_delta (%d) from ack_seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned int flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock_bh(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock_bh(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff, ipvsh)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app, ipvsh); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL, ipvsh); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff, ipvsh)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app, ipvsh); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL, ipvsh); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ipvs->app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(ipvs, 0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; +#endif + +int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) +{ + INIT_LIST_HEAD(&ipvs->app_list); +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, + &ip_vs_app_seq_ops, + sizeof(struct seq_net_private))) + return -ENOMEM; +#endif + return 0; +} + +void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) +{ + unregister_ip_vs_app(ipvs, NULL /* all */); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_app", ipvs->net->proc_net); +#endif +} diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c new file mode 100644 index 000000000..e1b9b5290 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -0,0 +1,1538 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include /* for proc_net_* */ +#include +#include +#include +#include + +#include +#include + + +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. Default is what was selected at compile time. +*/ +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct hlist_head *ip_vs_conn_tab __read_mostly; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd __read_mostly; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 5 +#define CT_LOCKARRAY_SIZE (1<>8)) & ip_vs_conn_tab_mask; +#endif + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; +} + +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) +{ + const union nf_inet_addr *addr; + __be16 port; + + if (p->pe_data && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; + + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); +} + +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +{ + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); + + if (cp->pe) { + p.pe = cp->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + + return ip_vs_conn_hashkey_param(&p, false); +} + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned int hash; + int ret; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return 0; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + cp->flags |= IP_VS_CONN_F_HASHED; + refcount_inc(&cp->refcnt); + hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); + ret = 1; + } else { + pr_err("%s(): request for already hashed, called from %pS\n", + __func__, __builtin_return_address(0)); + ret = 0; + } + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. Caller should hold conn reference. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned int hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + refcount_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + +/* Try to unlink ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) +{ + unsigned int hash; + bool ret = false; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return refcount_dec_if_one(&cp->refcnt); + + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock_bh(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + /* Decrease refcnt and unlink conn only if we are last user */ + if (refcount_dec_if_one(&cp->refcnt)) { + hlist_del_rcu(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + ret = true; + } + } + + spin_unlock(&cp->lock); + ct_write_unlock_bh(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + unsigned int hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey_param(p, false); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->cport == cp->cport && p->vport == cp->vport && + cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol && + cp->ipvs == p->ipvs) { + if (!__ip_vs_conn_get(cp)) + continue; + /* HIT */ + rcu_read_unlock(); + return cp; + } + } + + rcu_read_unlock(); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? "hit" : "not hit"); + + return cp; +} + +static int +ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, + int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); + if (pptr == NULL) + return 1; + + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); + return 0; +} + +struct ip_vs_conn * +ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) + return NULL; + + return ip_vs_conn_in_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) +{ + unsigned int hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey_param(p, false); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (unlikely(p->pe_data && p->pe->ct_match)) { + if (cp->ipvs != p->ipvs) + continue; + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { + if (__ip_vs_conn_get(cp)) + goto out; + } + continue; + } + + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->vport == cp->vport && p->cport == cp->cport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol && + cp->ipvs == p->ipvs) { + if (__ip_vs_conn_get(cp)) + goto out; + } + } + cp = NULL; + + out: + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? "hit" : "not hit"); + + return cp; +} + +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) +{ + unsigned int hash; + struct ip_vs_conn *cp, *ret=NULL; + const union nf_inet_addr *saddr; + __be16 sport; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey_param(p, true); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->vport != cp->cport) + continue; + + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + sport = cp->vport; + saddr = &cp->vaddr; + } else { + sport = cp->dport; + saddr = &cp->daddr; + } + + if (p->cport == sport && cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, saddr) && + p->protocol == cp->protocol && + cp->ipvs == p->ipvs) { + if (!__ip_vs_conn_get(cp)) + continue; + /* HIT */ + ret = cp; + break; + } + } + + rcu_read_unlock(); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +struct ip_vs_conn * +ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, + const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) + return NULL; + + return ip_vs_conn_out_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); + +/* + * Put back the conn and restart its timer with its timeout + */ +static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) +{ + unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? + 0 : cp->timeout; + mod_timer(&cp->timer, jiffies+t); + + __ip_vs_conn_put(cp); +} + +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && + (refcount_read(&cp->refcnt) == 1) && + !timer_pending(&cp->timer)) + /* expire connection immediately */ + ip_vs_conn_expire(&cp->timer); + else + __ip_vs_conn_put_timer(cp); +} + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock_bh(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock_bh(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else +#endif + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + if (cp->daf == AF_INET6) + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + else + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + unsigned int conn_flags; + __u32 flags; + + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + ip_vs_dest_hold(dest); + + conn_flags = atomic_read(&dest->conn_flags); + if (cp->protocol != IPPROTO_UDP) + conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + flags = cp->flags; + /* Bind with the destination and its corresponding transmitter */ + if (flags & IP_VS_CONN_F_SYNC) { + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); + } + flags |= conn_flags; + cp->flags = flags; + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, refcount_read(&cp->refcnt), + refcount_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so modify the counters + * according to the flags, later the protocol can + * update them on state change + */ + if (!(flags & IP_VS_CONN_F_INACTIVE)) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the persistent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +void ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + rcu_read_lock(); + + /* This function is only invoked by the synchronization code. We do + * not currently support heterogeneous pools with synchronization, + * so we can make the assumption that the svc_af is the same as the + * dest_af + */ + dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + if (dest) { + struct ip_vs_proto_data *pd; + + spin_lock_bh(&cp->lock); + if (cp->dest) { + spin_unlock_bh(&cp->lock); + rcu_read_unlock(); + return; + } + + /* Applications work depending on the forwarding method + * but better to reassign them always when binding dest */ + if (cp->app) + ip_vs_unbind_app(cp); + + ip_vs_bind_dest(cp, dest); + spin_unlock_bh(&cp->lock); + + /* Update its packet transmitter */ + cp->packet_xmit = NULL; +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol); + if (pd && atomic_read(&pd->appcnt)) + ip_vs_bind_app(cp, pd->pp); + } + rcu_read_unlock(); +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, refcount_read(&cp->refcnt), + refcount_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the persistent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + ip_vs_dest_put(dest); +} + +static int expire_quiescent_template(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL + return ipvs->sysctl_expire_quiescent_template && + (atomic_read(&dest->weight) == 0); +#else + return 0; +#endif +} + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) +{ + struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = ct->ipvs; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + expire_quiescent_template(ipvs, dest) || + (cdest && (dest != cdest))) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->daf, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + __ip_vs_conn_put(ct); + return 0; + } + return 1; +} + +static void ip_vs_conn_rcu_free(struct rcu_head *head) +{ + struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, + rcu_head); + + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + kmem_cache_free(ip_vs_conn_cachep, cp); +} + +/* Try to delete connection while not holding reference */ +static void ip_vs_conn_del(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + ip_vs_conn_expire(&cp->timer); + } +} + +/* Try to delete connection while holding reference */ +static void ip_vs_conn_del_put(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) { + /* Drop cp->control chain too */ + if (cp->control) + cp->timeout = 0; + __ip_vs_conn_put(cp); + ip_vs_conn_expire(&cp->timer); + } else { + __ip_vs_conn_put(cp); + } +} + +static void ip_vs_conn_expire(struct timer_list *t) +{ + struct ip_vs_conn *cp = from_timer(cp, t, timer); + struct netns_ipvs *ipvs = cp->ipvs; + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* Unlink conn if not referenced anymore */ + if (likely(ip_vs_conn_unlink(cp))) { + struct ip_vs_conn *ct = cp->control; + + /* delete the timer if it is activated by other users */ + del_timer(&cp->timer); + + /* does anybody control me? */ + if (ct) { + bool has_ref = !cp->timeout && __ip_vs_conn_get(ct); + + ip_vs_control_del(cp); + /* Drop CTL or non-assured TPL if not used anymore */ + if (has_ref && !atomic_read(&ct->n_control) && + (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || + !(ct->state & IP_VS_CTPL_S_ASSURED))) { + IP_VS_DBG(4, "drop controlling connection\n"); + ip_vs_conn_del_put(ct); + } else if (has_ref) { + __ip_vs_conn_put(ct); + } + } + + if ((cp->flags & IP_VS_CONN_F_NFCT) && + !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + ip_vs_conn_rcu_free(&cp->rcu_head); + else + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + atomic_dec(&ipvs->conn_count); + return; + } + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", + refcount_read(&cp->refcnt), + atomic_read(&cp->n_control)); + + refcount_inc(&cp->refcnt); + cp->timeout = 60*HZ; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); + + __ip_vs_conn_put_timer(cp); +} + +/* Modify timer, so that it expires as soon as possible. + * Can be called without reference only if under RCU lock. + * We can have such chain of conns linked with ->control: DATA->CTL->TPL + * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup + * - cp->timeout=0 indicates all conns from chain should be dropped but + * TPL is not dropped if in assured state + */ +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + /* Using mod_timer_pending will ensure the timer is not + * modified after the final del_timer in ip_vs_conn_expire. + */ + if (timer_pending(&cp->timer) && + time_after(cp->timer.expires, jiffies)) + mod_timer_pending(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, + const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, + struct ip_vs_dest *dest, __u32 fwmark) +{ + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = p->ipvs; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs, + p->protocol); + + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NULL; + } + + INIT_HLIST_NODE(&cp->c_list); + timer_setup(&cp->timer, ip_vs_conn_expire, 0); + cp->ipvs = ipvs; + cp->af = p->af; + cp->daf = dest_af; + cp->protocol = p->protocol; + ip_vs_addr_set(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ + ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->vaddr, p->vaddr); + cp->vport = p->vport; + ip_vs_addr_set(cp->daf, &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + cp->fwmark = fwmark; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } else { + cp->pe = NULL; + cp->pe_data = NULL; + cp->pe_data_len = 0; + } + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. + */ + refcount_set(&cp->refcnt, 1); + + cp->control = NULL; + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + cp->packet_xmit = NULL; + cp->app = NULL; + cp->app_data = NULL; + /* reset struct ip_vs_seq */ + cp->in_seq.delta = 0; + cp->out_seq.delta = 0; + + atomic_inc(&ipvs->conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + cp->dest = NULL; + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->old_state = 0; + cp->timeout = 3*HZ; + cp->sync_endtime = jiffies & ~3UL; + + /* Bind its packet transmitter */ +#ifdef CONFIG_IP_VS_IPV6 + if (p->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + if (unlikely(pd && atomic_read(&pd->appcnt))) + ip_vs_bind_app(cp, pd->pp); + + /* + * Allow conntrack to be preserved. By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled(ipvs)) + cp->flags |= IP_VS_CONN_F_NFCT; + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { + struct seq_net_private p; + struct hlist_head *l; +}; + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_iter_state *iter = seq->private; + + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + /* __ip_vs_conn_get() is not needed by + * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show + */ + if (pos-- == 0) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + } + cond_resched_rcu(); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct ip_vs_iter_state *iter = seq->private; + + iter->l = NULL; + rcu_read_lock(); + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *e; + struct hlist_head *l = iter->l; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? */ + e = rcu_dereference(hlist_next_rcu(&cp->c_list)); + if (e) + return hlist_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + while (++idx < ip_vs_conn_tab_size) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + cond_resched_rcu(); + } + iter->l = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + char dbuf[IP_VS_ADDRSTRLEN]; + + if (!net_eq(cp->ipvs->net, net)) + return 0; + if (cp->pe_data) { + pe_data[0] = ' '; + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->pe->show_pe_data(cp, pe_data + len); + } + pe_data[len] = '\0'; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%s %04X %-11s %7u%s\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + dbuf, ntohs(cp->dport), + ip_vs_state_name(cp), + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000, + pe_data); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %s %04X %-11s %7u%s\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + dbuf, ntohs(cp->dport), + ip_vs_state_name(cp), + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000, + pe_data); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static const char *ip_vs_origin_name(unsigned int flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + char dbuf[IP_VS_ADDRSTRLEN]; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!net_eq(cp->ipvs->net, net)) + return 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->daf == AF_INET6) + snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); + else +#endif + snprintf(dbuf, sizeof(dbuf), "%08X", + ntohl(cp->daddr.ip)); + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%s %04X %-11s %-6s %7u\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + dbuf, ntohs(cp->dport), + ip_vs_state_name(cp), + ip_vs_origin_name(cp->flags), + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%s %04X %-11s %-6s %7u\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + dbuf, ntohs(cp->dport), + ip_vs_state_name(cp), + ip_vs_origin_name(cp->flags), + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; +#endif + + +/* Randomly drop connection entries before running out of memory + * Can be used for DATA and CTL conns. For TPL conns there are exceptions: + * - traffic for services in OPS mode increases ct->in_pkts, so it is supported + * - traffic for services not in OPS mode does not increase ct->in_pkts in + * all cases, so it is not supported + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static signed char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp; + + rcu_read_lock(); + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->ipvs != ipvs) + continue; + if (atomic_read(&cp->n_control)) + continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + /* connection template of OPS */ + if (ip_vs_conn_ops_mode(cp)) + goto try_drop; + if (!(cp->state & IP_VS_CTPL_S_ASSURED)) + goto drop; + continue; + } + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else if (cp->protocol == IPPROTO_SCTP) { + switch (cp->state) { + case IP_VS_SCTP_S_INIT1: + case IP_VS_SCTP_S_INIT: + break; + case IP_VS_SCTP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + default: + continue; + } + } else { +try_drop: + if (!todrop_entry(cp)) + continue; + } + +drop: + IP_VS_DBG(4, "drop connection\n"); + ip_vs_conn_del(cp); + } + cond_resched_rcu(); + } + rcu_read_unlock(); +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + +flush_again: + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (cp->ipvs != ipvs) + continue; + if (atomic_read(&cp->n_control)) + continue; + cp_c = cp->control; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { + IP_VS_DBG(4, "del controlling connection\n"); + ip_vs_conn_del(cp_c); + } + } + cond_resched_rcu(); + } + rcu_read_unlock(); + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ipvs->conn_count) != 0) { + schedule(); + goto flush_again; + } +} + +#ifdef CONFIG_SYSCTL +void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_conn *cp, *cp_c; + struct ip_vs_dest *dest; + + rcu_read_lock(); + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { + if (cp->ipvs != ipvs) + continue; + + dest = cp->dest; + if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) + continue; + + if (atomic_read(&cp->n_control)) + continue; + + cp_c = cp->control; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_del(cp); + if (cp_c && !atomic_read(&cp_c->n_control)) { + IP_VS_DBG(4, "del controlling connection\n"); + ip_vs_conn_del(cp_c); + } + } + cond_resched_rcu(); + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + break; + } + rcu_read_unlock(); +} +#endif + +/* + * per netns init and exit + */ +int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) +{ + atomic_set(&ipvs->conn_count, 0); + +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net, + &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn; + + if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net, + &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state))) + goto err_conn_sync; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +err_conn_sync: + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); +err_conn: + return -ENOMEM; +#endif +} + +void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(ipvs); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); + remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); +#endif +} + +int __init ip_vs_conn_init(void) +{ + int idx; + + /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 27) { + pr_info("conn_tab_bits not in [8, 27]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab))); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + pr_info("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024); + IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) + INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + +void ip_vs_conn_cleanup(void) +{ + /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ + rcu_barrier(); + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + vfree(ip_vs_conn_tab); +} diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c new file mode 100644 index 000000000..b5ae41966 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -0,0 +1,2456 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include +#include +#include +#include +#include /* net_generic() */ + +#include +#include + +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#include +#endif + +#include +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif +EXPORT_SYMBOL(ip_vs_new_conn_out); + +#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) +#define SNAT_CALL(f, ...) \ + INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) +#elif defined(CONFIG_IP_VS_PROTO_TCP) +#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__) +#elif defined(CONFIG_IP_VS_PROTO_UDP) +#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__) +#else +#define SNAT_CALL(f, ...) f(__VA_ARGS__) +#endif + +static unsigned int ip_vs_net_id __read_mostly; +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned int proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_SCTP: + return "SCTP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%u", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = cp->ipvs; + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; + + local_bh_disable(); + + s = this_cpu_ptr(dest->stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); + u64_stats_update_end(&s->syncp); + + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); + u64_stats_update_end(&s->syncp); + + local_bh_enable(); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = cp->ipvs; + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; + + local_bh_disable(); + + s = this_cpu_ptr(dest->stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); + u64_stats_update_end(&s->syncp); + + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); + u64_stats_update_end(&s->syncp); + + local_bh_enable(); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + struct netns_ipvs *ipvs = svc->ipvs; + struct ip_vs_cpu_stats *s; + + local_bh_disable(); + + s = this_cpu_ptr(cp->dest->stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.conns); + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(svc->stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.conns); + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + u64_stats_update_begin(&s->syncp); + u64_stats_inc(&s->cnt.conns); + u64_stats_update_end(&s->syncp); + + local_bh_enable(); +} + + +static inline void +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + if (likely(pd->pp->state_transition)) + pd->pp->state_transition(cp, direction, skb, pd); +} + +static inline int +ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, + struct sk_buff *skb, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, + vport, p); + p->pe = rcu_dereference(svc->pe); + if (p->pe && p->pe->fill_param) + return p->pe->fill_param(p, skb); + + return 0; +} + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + struct sk_buff *skb, __be16 src_port, __be16 dst_port, + int *ignored, struct ip_vs_iphdr *iph) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __be16 dport = 0; /* destination port to forward */ + unsigned int flags; + struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + union nf_inet_addr snet; /* source network of the client, + after masking */ + const union nf_inet_addr *src_addr, *dst_addr; + + if (likely(!ip_vs_iph_inverse(iph))) { + src_addr = &iph->saddr; + dst_addr = &iph->daddr; + } else { + src_addr = &iph->daddr; + dst_addr = &iph->saddr; + } + + + /* Mask saddr with the netmask to adjust template granularity */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &src_addr->in6, + (__force __u32) svc->netmask); + else +#endif + snet.ip = src_addr->ip & svc->netmask; + + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), + IP_VS_DBG_ADDR(svc->af, &snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + { + int protocol = iph->protocol; + const union nf_inet_addr *vaddr = dst_addr; + __be16 vport = 0; + + if (dst_port == svc->port) { + /* non-FTP template: + * + * FTP template: + * + */ + if (svc->port != FTPPORT) + vport = dst_port; + } else { + /* Note: persistent fwmark-based services and + * persistent port zero service are handled here. + * fwmark template: + * + * port zero template: + * + */ + if (svc->fwmark) { + protocol = IPPROTO_IP; + vaddr = &fwmark; + } + } + /* return *ignored = -1 so NF_DROP can be used */ + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m) < 0) { + *ignored = -1; + return NULL; + } + } + + /* Check if a template already exists */ + ct = ip_vs_ct_in_get(¶m); + if (!ct || !ip_vs_check_template(ct, NULL)) { + struct ip_vs_scheduler *sched; + + /* + * No template found or the dest of the connection + * template is not available. + * return *ignored=0 i.e. ICMP and NF_DROP + */ + sched = rcu_dereference(svc->scheduler); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } + if (!dest) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + kfree(param.pe_data); + *ignored = 0; + return NULL; + } + + if (dst_port == svc->port && svc->port != FTPPORT) + dport = dest->port; + + /* Create a template + * This adds param.pe_data to the template, + * and thus param.pe_data will be destroyed + * when the template expires */ + ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, + IP_VS_CONN_F_TEMPLATE, dest, skb->mark); + if (ct == NULL) { + kfree(param.pe_data); + *ignored = -1; + return NULL; + } + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + kfree(param.pe_data); + } + + dport = dst_port; + if (dport == svc->port && dest->port) + dport = dest->port; + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a new connection according to the template + */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, + src_port, dst_addr, dst_port, ¶m); + + cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, + skb->mark); + if (cp == NULL) { + ip_vs_conn_put(ct); + *ignored = -1; + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + * + * Usage of *ignored + * + * 1 : protocol tried to schedule (eg. on SYN), found svc but the + * svc/scheduler decides that this packet should be accepted with + * NF_ACCEPT because it must not be scheduled. + * + * 0 : scheduler can not find destination, so try bypass or + * return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 : scheduler tried to schedule but fatal error occurred, eg. + * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + * failure such as missing Call-ID, ENOMEM on skb_linearize + * or pe_data. In this case we should return NF_DROP without + * any attempts to send ICMP with ip_vs_leave. + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *ignored, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_protocol *pp = pd->pp; + struct ip_vs_conn *cp = NULL; + struct ip_vs_scheduler *sched; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr, cport, vport; + const void *caddr, *vaddr; + unsigned int flags; + + *ignored = 1; + /* + * IPv6 frags, only the first hit here. + */ + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!ip_vs_iph_inverse(iph))) { + cport = pptr[0]; + caddr = &iph->saddr; + vport = pptr[1]; + vaddr = &iph->daddr; + } else { + cport = pptr[1]; + caddr = &iph->daddr; + vport = pptr[0]; + vaddr = &iph->saddr; + } + + /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (cport == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, + "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + cp = INDIRECT_CALL_1(pp->conn_in_get, + ip_vs_conn_in_get_proto, svc->ipvs, + svc->af, skb, iph); + iph->hdr_flags ^= IP_VS_HDR_INVERSE; + + if (cp) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, + "Not scheduling reply for existing" + " connection"); + __ip_vs_conn_put(cp); + return NULL; + } + } + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, cport, vport, ignored, + iph); + + *ignored = 0; + + /* + * Non-persistent service + */ + if (!svc->fwmark && vport != svc->port) { + if (!svc->port) + pr_err("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + sched = rcu_dereference(svc->scheduler); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a connection entry. + */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, &p); + cp = ip_vs_conn_new(&p, dest->af, &dest->addr, + dest->port ? dest->port : vport, + flags, dest, skb->mark); + if (!cp) { + *ignored = -1; + return NULL; + } + } + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), + cp->flags, refcount_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + +static inline int ip_vs_addr_is_unicast(struct net *net, int af, + union nf_inet_addr *addr) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; +#endif + return (inet_addr_type(net, addr->ip) == RTN_UNICAST); +} + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *pptr, dport; + struct netns_ipvs *ipvs = svc->ipvs; + struct net *net = ipvs->net; + + pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); + if (!pptr) + return NF_DROP; + dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is a non-local unicast, then create + a cache_bypass connection entry */ + if (sysctl_cache_bypass(ipvs) && svc->fwmark && + !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && + ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { + int ret; + struct ip_vs_conn *cp; + unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && + iph->protocol == IPPROTO_UDP) ? + IP_VS_CONN_F_ONE_PACKET : 0; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + + /* create a new connection entry */ + IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, + IP_VS_CONN_F_BYPASS | flags, + NULL, skb->mark); + if (!cp) + return NF_DROP; + } + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pd->pp, iph); + /* do not touch skb anymore */ + + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if (svc->port == FTPPORT && dport != FTPPORT) + return NF_ACCEPT; + + if (unlikely(ip_vs_iph_icmp(iph))) + return NF_DROP; + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + } else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + +#ifdef CONFIG_SYSCTL + +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_snat_reroute; +} + +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_nat_icmp_send; +} + +#else + +static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } +static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } + +#endif + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + +static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, + struct sk_buff *skb, u_int32_t user) +{ + int err; + + local_bh_disable(); + err = ip_defrag(ipvs->net, skb, user); + local_bh_enable(); + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, + struct sk_buff *skb, unsigned int hooknum) +{ + if (!sysctl_snat_reroute(ipvs)) + return 0; + /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ + if (NF_INET_LOCAL_IN == hooknum) + return 0; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + struct dst_entry *dst = skb_dst(skb); + + if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && + ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) + return 1; + } else +#endif + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) + return 1; + + return 0; +} + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || + IPPROTO_SCTP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = 0; + unsigned int offs = 0; /* header offset*/ + int protocol; + struct icmp6hdr *icmph; + struct ipv6hdr *ciph; + unsigned short fragoffs; + + ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); + icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); + offs = icmp_offset + sizeof(struct icmp6hdr); + ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); + + protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP/SCTP port */ + if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol)) { + __be16 *ports = (void *)(skb_network_header(skb) + offs); + + IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, + ntohs(inout ? ports[1] : ports[0]), + ntohs(inout ? cp->vport : cp->dport)); + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl, + unsigned int hooknum) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto after_nat; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol) + offset += 2 * sizeof(__u16); + if (skb_ensure_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) + goto out; + +after_nat: + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); + + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); + + /* The embedded headers contain source and dest in reverse order */ + cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, + ipvs, AF_INET, skb, &ciph); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, ciph.len, ihl, hooknum); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) +{ + struct icmp6hdr _icmph, *ic; + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + union nf_inet_addr snet; + unsigned int offset; + + *related = 1; + ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { + *related = 0; + return NF_ACCEPT; + } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (ipvsh->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &ipvsh->saddr, &ipvsh->daddr); + + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), + true, &ciph)) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(ciph.protocol); + if (!pp) + return NF_ACCEPT; + + /* The embedded headers contain source and dest in reverse order */ + cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, + ipvs, AF_INET6, skb, &ciph); + if (!cp) + return NF_ACCEPT; + + snet.in6 = ciph.saddr.in6; + offset = ciph.len; + return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, + pp, offset, sizeof(struct ipv6hdr), + hooknum); +} +#endif + +/* + * Check if sctp chunc is ABORT chunk + */ +static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) +{ + struct sctp_chunkhdr *sch, schunk; + sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), + sizeof(schunk), &schunk); + if (sch == NULL) + return 0; + if (sch->type == SCTP_CID_ABORT) + return 1; + return 0; +} + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +static inline bool is_new_conn(const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + return th->syn; + } + case IPPROTO_SCTP: { + struct sctp_chunkhdr *sch, schunk; + + sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), + sizeof(schunk), &schunk); + if (sch == NULL) + return false; + return sch->type == SCTP_CID_INIT; + } + default: + return false; + } +} + +static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, + int conn_reuse_mode) +{ + /* Controlled (FTP DATA or persistence)? */ + if (cp->control) + return false; + + switch (cp->protocol) { + case IPPROTO_TCP: + return (cp->state == IP_VS_TCP_S_TIME_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE) || + ((conn_reuse_mode & 2) && + (cp->state == IP_VS_TCP_S_FIN_WAIT) && + (cp->flags & IP_VS_CONN_F_NOOUTPUT)); + case IPPROTO_SCTP: + return cp->state == IP_VS_SCTP_S_CLOSED; + default: + return false; + } +} + +/* Generic function to create new connections for outgoing RS packets + * + * Pre-requisites for successful connection creation: + * 1) Virtual Service is NOT fwmark based: + * In fwmark-VS actual vaddr and vport are unknown to IPVS + * 2) Real Server and Virtual Service were NOT configured without port: + * This is to allow match of different VS to the same RS ip-addr + */ +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + struct ip_vs_conn_param param; + struct ip_vs_conn *ct = NULL, *cp = NULL; + const union nf_inet_addr *vaddr, *daddr, *caddr; + union nf_inet_addr snet; + __be16 vport; + unsigned int flags; + + EnterFunction(12); + vaddr = &svc->addr; + vport = svc->port; + daddr = &iph->saddr; + caddr = &iph->daddr; + + /* check pre-requisites are satisfied */ + if (svc->fwmark) + return NULL; + if (!vport || !dport) + return NULL; + + /* for persistent service first create connection template */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + /* apply netmask the same way ingress-side does */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &caddr->in6, + (__force __u32)svc->netmask); + else +#endif + snet.ip = caddr->ip & svc->netmask; + /* fill params and create template if not existent */ + if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, + &snet, 0, vaddr, + vport, ¶m) < 0) + return NULL; + ct = ip_vs_ct_in_get(¶m); + /* check if template exists and points to the same dest */ + if (!ct || !ip_vs_check_template(ct, dest)) { + ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, + IP_VS_CONN_F_TEMPLATE, dest, 0); + if (!ct) { + kfree(param.pe_data); + return NULL; + } + ct->timeout = svc->timeout; + } else { + kfree(param.pe_data); + } + } + + /* connection flags */ + flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; + /* create connection */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, ¶m); + cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); + if (!cp) { + if (ct) + ip_vs_conn_put(ct); + return NULL; + } + if (ct) { + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + } + ip_vs_conn_stats(cp, svc); + + /* return connection (will be used to handle outgoing packet) */ + IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + cp->flags, refcount_read(&cp->refcnt)); + LeaveFunction(12); + return cp; +} + +/* Handle outgoing packets which are considered requests initiated by + * real servers, so that subsequent responses from external client can be + * routed to the right real server. + * Used also for outgoing responses in OPS mode. + * + * Connection management is handled by persistent-engine specific callback. + */ +static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, + struct netns_ipvs *ipvs, + int af, struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp = NULL; + __be16 _ports[2], *pptr; + + if (hooknum == NF_INET_LOCAL_IN) + return NULL; + + pptr = frag_safe_skb_hp(skb, iph->len, + sizeof(_ports), _ports); + if (!pptr) + return NULL; + + dest = ip_vs_find_real_service(ipvs, af, iph->protocol, + &iph->saddr, pptr[0]); + if (dest) { + struct ip_vs_service *svc; + struct ip_vs_pe *pe; + + svc = rcu_dereference(dest->svc); + if (svc) { + pe = rcu_dereference(svc->pe); + if (pe && pe->conn_out) + cp = pe->conn_out(svc, dest, skb, iph, + pptr[0], pptr[1]); + } + } + + return cp; +} + +/* Handle response packets: rewrite addresses and send away... + */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, + unsigned int hooknum) +{ + struct ip_vs_protocol *pp = pd->pp; + + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto after_nat; + + IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); + + if (skb_ensure_writable(skb, iph->len)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && + !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* + * nf_iterate does not expect change in the skb->dst->dev. + * It looks like it is not fatal to enable this code for hooks + * where our handlers are at the end of the chain list and + * when all next handlers use skb->dst->dev and not outdev. + * It will definitely route properly the inout NAT traffic + * when multiple paths are used. + */ + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ + if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) + goto drop; + + IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); + +after_nat: + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + ip_vs_conn_put(cp); + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + LeaveFunction(11); + return NF_STOLEN; +} + +/* + * Check if outgoing packet belongs to the established ip_vs_conn. + */ +static unsigned int +ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) +{ + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + int af = state->pf; + struct sock *sk; + + EnterFunction(11); + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + sk = skb_to_full_sk(skb); + /* Bad... Do not break raw sockets */ + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + + if (!ipvs->enable) + return NF_ACCEPT; + + ip_vs_fill_iph_skb(af, skb, false, &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, + hooknum, &iph); + + if (related) + return verdict; + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); + + if (related) + return verdict; + } + + pd = ip_vs_proto_data_get(ipvs, iph.protocol); + if (unlikely(!pd)) + return NF_ACCEPT; + pp = pd->pp; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET) +#endif + if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { + if (ip_vs_gather_frags(ipvs, skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + + ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, + ipvs, af, skb, &iph); + + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, hooknum); + + /* Check for real-server-started requests */ + if (atomic_read(&ipvs->conn_out_counter)) { + /* Currently only for UDP: + * connection oriented protocols typically use + * ephemeral ports for outgoing connections, so + * related incoming responses would not match any VS + */ + if (pp->protocol == IPPROTO_UDP) { + cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, + hooknum); + } + } + + if (sysctl_nat_icmp_send(ipvs) && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = frag_safe_skb_hp(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (!skb->dev) + skb->dev = ipvs->net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + + IP_VS_DBG_PKT(12, af, pp, skb, iph.off, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; +} + +static unsigned int +ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_protocol *pp = pd->pp; + + if (!iph->fragoffs) { + /* No (second) fragments need to enter here, as nf_defrag_ipv6 + * replayed fragment zero will already have created the cp + */ + + /* Schedule and create new connection entry into cpp */ + if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) + return 0; + } + + if (unlikely(!*cpp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, af, pp, skb, iph->off, + "ip_vs_in: packet continues traversal as normal"); + + /* Fragment couldn't be mapped to a conn entry */ + if (iph->fragoffs) + IP_VS_DBG_PKT(7, af, pp, skb, iph->off, + "unhandled fragment"); + + *verdict = NF_ACCEPT; + return 0; + } + + return 1; +} + +/* Check the UDP tunnel and return its header length */ +static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct udphdr _udph, *udph; + struct ip_vs_dest *dest; + + udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!udph) + goto unk; + offset += sizeof(struct udphdr); + dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + struct guehdr _gueh, *gueh; + + gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); + if (!gueh) + goto unk; + if (gueh->control != 0 || gueh->version != 0) + goto unk; + /* Later we can support also IPPROTO_IPV6 */ + if (gueh->proto_ctype != IPPROTO_IPIP) + goto unk; + *proto = gueh->proto_ctype; + return sizeof(struct udphdr) + sizeof(struct guehdr) + + (gueh->hlen << 2); + } + +unk: + return 0; +} + +/* Check the GRE tunnel and return its header length */ +static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct gre_base_hdr _greh, *greh; + struct ip_vs_dest *dest; + + greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); + if (!greh) + goto unk; + dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 type; + + /* Only support version 0 and C (csum) */ + if ((greh->flags & ~GRE_CSUM) != 0) + goto unk; + type = greh->protocol; + /* Later we can support also IPPROTO_IPV6 */ + if (type != htons(ETH_P_IP)) + goto unk; + *proto = IPPROTO_IPIP; + return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + } + +unk: + return 0; +} + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, + unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, offset2, ihl, verdict; + bool tunnel, new_cp = false; + union nf_inet_addr *raddr; + char *outer_proto = "IPIP"; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + raddr = (union nf_inet_addr *)&cih->daddr; + + /* Special case for errors for IPIP/UDP/GRE tunnel packets */ + tunnel = false; + if (cih->protocol == IPPROTO_IPIP) { + struct ip_vs_dest *dest; + + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + /* Error for our IPIP must arrive at LOCAL_IN */ + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) + return NF_ACCEPT; + dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); + /* Only for known tunnel */ + if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) + return NF_ACCEPT; + offset += cih->ihl * 4; + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + tunnel = true; + } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ + cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ + /* Error for our tunnel must arrive at LOCAL_IN */ + (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { + __u8 iproto; + int ulen; + + /* Non-first fragment has no UDP/GRE header */ + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + offset2 = offset + cih->ihl * 4; + if (cih->protocol == IPPROTO_UDP) { + ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + outer_proto = "UDP"; + } else { + ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + outer_proto = "GRE"; + } + if (ulen > 0) { + /* Skip IP and UDP/GRE tunnel headers */ + offset = offset2 + ulen; + /* Now we should be at the original IP header */ + cih = skb_header_pointer(skb, offset, sizeof(_ciph), + &_ciph); + if (cih && cih->version == 4 && cih->ihl >= 5 && + iproto == IPPROTO_IPIP) + tunnel = true; + else + return NF_ACCEPT; + } + } + + pd = ip_vs_proto_data_get(ipvs, cih->protocol); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); + + offset2 = offset; + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); + offset = ciph.len; + + /* The embedded headers contain source and dest in reverse order. + * For IPIP/UDP/GRE tunnel this is error for request, not for reply. + */ + cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, + ipvs, AF_INET, skb, &ciph); + + if (!cp) { + int v; + + if (tunnel || !sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) + return v; + new_cp = true; + } + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", + &iph->saddr); + goto out; + } + + if (tunnel) { + __be32 info = ic->un.gateway; + __u8 type = ic->type; + __u8 code = ic->code; + + /* Update the MTU */ + if (ic->type == ICMP_DEST_UNREACH && + ic->code == ICMP_FRAG_NEEDED) { + struct ip_vs_dest *dest = cp->dest; + u32 mtu = ntohs(ic->un.frag.mtu); + __be16 frag_off = cih->frag_off; + + /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ + if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) + goto ignore_tunnel; + offset2 -= ihl + sizeof(_icmph); + skb_reset_network_header(skb); + IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", + outer_proto, &ip_hdr(skb)->saddr, + &ip_hdr(skb)->daddr, mtu); + ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); + /* Client uses PMTUD? */ + if (!(frag_off & htons(IP_DF))) + goto ignore_tunnel; + /* Prefer the resulting PMTU */ + if (dest) { + struct ip_vs_dest_dst *dest_dst; + + dest_dst = rcu_dereference(dest->dest_dst); + if (dest_dst) + mtu = dst_mtu(dest_dst->dst_cache); + } + if (mtu > 68 + sizeof(struct iphdr)) + mtu -= sizeof(struct iphdr); + info = htonl(mtu); + } + /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of + * original request. + */ + if (pskb_pull(skb, offset2) == NULL) + goto ignore_tunnel; + skb_reset_network_header(skb); + IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, + type, code, ntohl(info)); + icmp_send(skb, type, code, info); + /* ICMP can be shorter but anyways, account it */ + ip_vs_out_stats(cp, skb); + +ignore_tunnel: + consume_skb(skb); + verdict = NF_STOLEN; + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || + IPPROTO_SCTP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); + +out: + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, + int *related, unsigned int hooknum, + struct ip_vs_iphdr *iph) +{ + struct icmp6hdr _icmph, *ic; + struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, verdict; + bool new_cp = false; + + *related = 1; + + ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { + *related = 0; + return NF_ACCEPT; + } + /* Fragment header that is before ICMP header tells us that: + * it's not an error message since they can't be fragmented. + */ + if (iph->flags & IP6_FH_F_FRAG) + return NF_DROP; + + IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); + + offset = iph->len + sizeof(_icmph); + if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) + return NF_ACCEPT; + + pd = ip_vs_proto_data_get(ipvs, ciph.protocol); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Cannot handle fragmented embedded protocol */ + if (ciph.fragoffs) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); + + /* The embedded headers contain source and dest in reverse order + * if not from localhost + */ + cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, + ipvs, AF_INET6, skb, &ciph); + + if (!cp) { + int v; + + if (!sysctl_schedule_icmp(ipvs)) + return NF_ACCEPT; + + if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) + return v; + + new_cp = true; + } + + /* VS/TUN, VS/DR and LOCALNODE just let it go */ + if ((hooknum == NF_INET_LOCAL_OUT) && + (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { + verdict = NF_ACCEPT; + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + + /* Need to mangle contained IPv6 header in ICMPv6 packet */ + offset = ciph.len; + if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || + IPPROTO_SCTP == ciph.protocol) + offset += 2 * sizeof(__u16); /* Also mangle ports */ + + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); + +out: + if (likely(!new_cp)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) +{ + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + int ret, pkts; + struct sock *sk; + int af = state->pf; + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset + */ + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iph_skb(af, skb, false, &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); + return NF_ACCEPT; + } + /* ipvs enabled in this netns ? */ + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + ip_vs_fill_iph_skb(af, skb, false, &iph); + + /* Bad... Do not break raw sockets */ + sk = skb_to_full_sk(skb); + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + return NF_ACCEPT; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, + hooknum, &iph); + + if (related) + return verdict; + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_in_icmp(ipvs, skb, &related, + hooknum); + + if (related) + return verdict; + } + + /* Protocol supported? */ + pd = ip_vs_proto_data_get(ipvs, iph.protocol); + if (unlikely(!pd)) { + /* The only way we'll see this packet again is if it's + * encapsulated, so mark it with ipvs_property=1 so we + * skip it if we're ignoring tunneled packets + */ + if (sysctl_ignore_tunneled(ipvs)) + skb->ipvs_property = 1; + + return NF_ACCEPT; + } + pp = pd->pp; + /* + * Check if the packet belongs to an existing connection entry + */ + cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, + ipvs, af, skb, &iph); + + if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) { + int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); + bool old_ct = false, resched = false; + + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && + unlikely(!atomic_read(&cp->dest->weight))) { + resched = true; + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); + } else if (conn_reuse_mode && + is_new_conn_expected(cp, conn_reuse_mode)) { + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); + if (!atomic_read(&cp->n_control)) { + resched = true; + } else { + /* Do not reschedule controlling connection + * that uses conntrack while it is still + * referenced by controlled connection(s). + */ + resched = !old_ct; + } + } + + if (resched) { + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; + if (!atomic_read(&cp->n_control)) + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (old_ct) + return NF_DROP; + cp = NULL; + } + } + + /* Check the server status */ + if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + if (sysctl_expire_nodest_conn(ipvs)) { + bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); + + if (!old_ct) + cp->flags &= ~IP_VS_CONN_F_NFCT; + + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + if (old_ct) + return NF_DROP; + cp = NULL; + } else { + __ip_vs_conn_put(cp); + return NF_DROP; + } + } + + if (unlikely(!cp)) { + int v; + + if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) + return v; + } + + IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); + + ip_vs_in_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp, &iph); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + * + * For ONE_PKT let ip_vs_sync_conn() do the filter work. + */ + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + pkts = sysctl_sync_threshold(ipvs); + else + pkts = atomic_inc_return(&cp->in_pkts); + + if (ipvs->sync_state & IP_VS_STATE_MASTER) + ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); + + ip_vs_conn_put(cp); + return ret; +} + +/* + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct netns_ipvs *ipvs = net_ipvs(state->net); + int r; + + /* ipvs enabled in this netns ? */ + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + return NF_ACCEPT; + + if (state->pf == NFPROTO_IPV4) { + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; +#ifdef CONFIG_IP_VS_IPV6 + } else { + struct ip_vs_iphdr iphdr; + + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); + + if (iphdr.protocol != IPPROTO_ICMPV6) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); +#endif + } + + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); +} + +static const struct nf_hook_ops ip_vs_ops4[] = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 2, + }, + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC - 1, + }, + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_out_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_in_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST + 2, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, +}; + +#ifdef CONFIG_IP_VS_IPV6 +static const struct nf_hook_ops ip_vs_ops6[] = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 2, + }, + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_NAT_SRC - 1, + }, + /* Before ip_vs_in, change source only for VS/NAT */ + { + .hook = ip_vs_out_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 1, + }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_in_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_NAT_DST + 2, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, +}; +#endif + +int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + int ret = 0; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return -EINVAL; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (!(ipvs->hooks_afmask & afmask)) { + ret = nf_register_net_hooks(ipvs->net, ops, count); + if (ret >= 0) + ipvs->hooks_afmask |= afmask; + } + return ret; +} + +void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) +{ + const struct nf_hook_ops *ops; + unsigned int count; + unsigned int afmask; + + if (af == AF_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + ops = ip_vs_ops6; + count = ARRAY_SIZE(ip_vs_ops6); + afmask = 2; +#else + return; +#endif + } else { + ops = ip_vs_ops4; + count = ARRAY_SIZE(ip_vs_ops4); + afmask = 1; + } + + if (ipvs->hooks_afmask & afmask) { + nf_unregister_net_hooks(ipvs->net, ops, count); + ipvs->hooks_afmask &= ~afmask; + } +} + +/* + * Initialize IP Virtual Server netns mem. + */ +static int __net_init __ip_vs_init(struct net *net) +{ + struct netns_ipvs *ipvs; + + ipvs = net_generic(net, ip_vs_net_id); + if (ipvs == NULL) + return -ENOMEM; + + /* Hold the beast until a service is registered */ + ipvs->enable = 0; + ipvs->net = net; + /* Counters used for creating unique names */ + ipvs->gen = atomic_read(&ipvs_netns_cnt); + atomic_inc(&ipvs_netns_cnt); + net->ipvs = ipvs; + + if (ip_vs_estimator_net_init(ipvs) < 0) + goto estimator_fail; + + if (ip_vs_control_net_init(ipvs) < 0) + goto control_fail; + + if (ip_vs_protocol_net_init(ipvs) < 0) + goto protocol_fail; + + if (ip_vs_app_net_init(ipvs) < 0) + goto app_fail; + + if (ip_vs_conn_net_init(ipvs) < 0) + goto conn_fail; + + if (ip_vs_sync_net_init(ipvs) < 0) + goto sync_fail; + + return 0; +/* + * Error handling + */ + +sync_fail: + ip_vs_conn_net_cleanup(ipvs); +conn_fail: + ip_vs_app_net_cleanup(ipvs); +app_fail: + ip_vs_protocol_net_cleanup(ipvs); +protocol_fail: + ip_vs_control_net_cleanup(ipvs); +control_fail: + ip_vs_estimator_net_cleanup(ipvs); +estimator_fail: + net->ipvs = NULL; + return -ENOMEM; +} + +static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) +{ + struct netns_ipvs *ipvs; + struct net *net; + + ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_conn_net_cleanup(ipvs); + ip_vs_app_net_cleanup(ipvs); + ip_vs_protocol_net_cleanup(ipvs); + ip_vs_control_net_cleanup(ipvs); + ip_vs_estimator_net_cleanup(ipvs); + IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); + net->ipvs = NULL; + } +} + +static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) +{ + struct netns_ipvs *ipvs; + struct net *net; + + EnterFunction(2); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_unregister_hooks(ipvs, AF_INET); + ip_vs_unregister_hooks(ipvs, AF_INET6); + ipvs->enable = 0; /* Disable packet reception */ + smp_wmb(); + ip_vs_sync_net_cleanup(ipvs); + } + LeaveFunction(2); +} + +static struct pernet_operations ipvs_core_ops = { + .init = __ip_vs_init, + .exit_batch = __ip_vs_cleanup_batch, + .id = &ip_vs_net_id, + .size = sizeof(struct netns_ipvs), +}; + +static struct pernet_operations ipvs_core_dev_ops = { + .exit_batch = __ip_vs_dev_cleanup_batch, +}; + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ret = ip_vs_control_init(); + if (ret < 0) { + pr_err("can't setup control.\n"); + goto exit; + } + + ip_vs_protocol_init(); + + ret = ip_vs_conn_init(); + if (ret < 0) { + pr_err("can't setup connection table.\n"); + goto cleanup_protocol; + } + + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_conn; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + + ret = ip_vs_register_nl_ioctl(); + if (ret < 0) { + pr_err("can't register netlink/ioctl.\n"); + goto cleanup_dev; + } + + pr_info("ipvs loaded.\n"); + + return ret; + +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); +cleanup_conn: + ip_vs_conn_cleanup(); +cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); +exit: + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + ip_vs_unregister_nl_ioctl(); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ + ip_vs_conn_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + pr_info("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 000000000..17a1b731a --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,4288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * Changes: + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#include +#endif +#include +#include +#include + +#include + +#include + +MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* sysctl variables */ + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); + + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ +static bool __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) +{ + struct flowi6 fl6 = { + .daddr = *addr, + }; + struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); + bool is_local; + + is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); + + dst_release(dst); + return is_local; +} +#endif + +#ifdef CONFIG_SYSCTL +/* + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs + */ +static void update_defense_level(struct netns_ipvs *ipvs) +{ + struct sysinfo i; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < ipvs->sysctl_amemthresh); + + local_bh_disable(); + + /* drop_entry */ + spin_lock(&ipvs->dropentry_lock); + switch (ipvs->sysctl_drop_entry) { + case 0: + atomic_set(&ipvs->dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ipvs->dropentry, 1); + ipvs->sysctl_drop_entry = 2; + } else { + atomic_set(&ipvs->dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ipvs->dropentry, 1); + } else { + atomic_set(&ipvs->dropentry, 0); + ipvs->sysctl_drop_entry = 1; + } + break; + case 3: + atomic_set(&ipvs->dropentry, 1); + break; + } + spin_unlock(&ipvs->dropentry_lock); + + /* drop_packet */ + spin_lock(&ipvs->droppacket_lock); + switch (ipvs->sysctl_drop_packet) { + case 0: + ipvs->drop_rate = 0; + break; + case 1: + if (nomem) { + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + ipvs->sysctl_drop_packet = 2; + } else { + ipvs->drop_rate = 0; + } + break; + case 2: + if (nomem) { + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + } else { + ipvs->drop_rate = 0; + ipvs->sysctl_drop_packet = 1; + } + break; + case 3: + ipvs->drop_rate = ipvs->sysctl_am_droprate; + break; + } + spin_unlock(&ipvs->droppacket_lock); + + /* secure_tcp */ + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { + case 0: + if (ipvs->old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (ipvs->old_secure_tcp < 2) + to_change = 1; + ipvs->sysctl_secure_tcp = 2; + } else { + if (ipvs->old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (ipvs->old_secure_tcp < 2) + to_change = 1; + } else { + if (ipvs->old_secure_tcp >= 2) + to_change = 0; + ipvs->sysctl_secure_tcp = 1; + } + break; + case 3: + if (ipvs->old_secure_tcp < 2) + to_change = 1; + break; + } + ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(ipvs, + ipvs->sysctl_secure_tcp > 1); + spin_unlock(&ipvs->securetcp_lock); + + local_bh_enable(); +} + +/* Handler for delayed work for expiring no + * destination connections + */ +static void expire_nodest_conn_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs; + + ipvs = container_of(work, struct netns_ipvs, + expire_nodest_conn_work.work); + ip_vs_expire_nodest_conn_flush(ipvs); +} + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ + +static void defense_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, defense_work.work); + + update_defense_level(ipvs); + if (atomic_read(&ipvs->dropentry)) + ip_vs_random_dropentry(ipvs); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); +} +#endif + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + + +/* + * Returns hash value for virtual service + */ +static inline unsigned int +ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int porth = ntohs(port); + __be32 addr_fold = addr->ip; + __u32 ahash; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + ahash = ntohl(addr_fold); + ahash ^= ((size_t) ipvs >> 8); + + return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & + IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) +{ + return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned int hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + pr_err("%s(): request for already hashed, called from %pS\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, + &svc->addr, svc->port); + hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); + hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from svc_table / svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + pr_err("%s(): request for unhash flagged, called from %pS\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the svc_table table */ + hlist_del_rcu(&svc->s_list); + } else { + /* Remove it from the svc_fwm_table table */ + hlist_del_rcu(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {netns, proto,addr,port} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + unsigned int hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); + + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol) + && (svc->ipvs == ipvs)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) +{ + unsigned int hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); + + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af + && (svc->ipvs == ipvs)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + +/* Find service, called under RCU lock */ +struct ip_vs_service * +ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + + /* + * Check the table hashed by fwmark first + */ + if (fwmark) { + svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); + if (svc) + goto out; + } + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); + + if (!svc && protocol == IPPROTO_TCP && + atomic_read(&ipvs->ftpsvc_counter) && + (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ipvs->nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); + } + + out: + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + rcu_assign_pointer(dest->svc, svc); +} + +static void ip_vs_service_free(struct ip_vs_service *svc) +{ + free_percpu(svc->stats.cpustats); + kfree(svc); +} + +static void ip_vs_service_rcu_free(struct rcu_head *head) +{ + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} + +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{ + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + if (do_delay) + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); + else + ip_vs_service_free(svc); + } +} + + +/* + * Returns hash value for real service + */ +static inline unsigned int ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + unsigned int porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* Hash ip_vs_dest in rs_table by . */ +static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +{ + unsigned int hash; + __be16 port; + + if (dest->in_rs_table) + return; + + switch (IP_VS_DFWD_METHOD(dest)) { + case IP_VS_CONN_F_MASQ: + port = dest->port; + break; + case IP_VS_CONN_F_TUNNEL: + switch (dest->tun_type) { + case IP_VS_CONN_F_TUNNEL_TYPE_GUE: + port = dest->tun_port; + break; + case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + case IP_VS_CONN_F_TUNNEL_TYPE_GRE: + port = 0; + break; + default: + return; + } + break; + default: + return; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); + + hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); + dest->in_rs_table = 1; +} + +/* Unhash ip_vs_dest from rs_table. */ +static void ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the rs_table table. + */ + if (dest->in_rs_table) { + hlist_del_rcu(&dest->d_list); + dest->in_rs_table = 0; + } +} + +/* Check if real service by is present */ +bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, + const union nf_inet_addr *daddr, __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { + /* HIT */ + return true; + } + } + + return false; +} + +/* Find real service record by . + * In case of multiple records with the same , only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, + __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* Find real service record by . + * In case of multiple records with the same , only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, + const union nf_inet_addr *daddr, + __be16 tun_port) +{ + struct ip_vs_dest *dest; + unsigned int hash; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, tun_port); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->tun_port == tun_port && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* Lookup destination by {addr,port} in the given service + * Called under RCU lock. + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if ((dest->af == dest_af) && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && + (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Created to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * Called under RCU lock, no refcnt is returned. + */ +struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, + const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol, __u32 fwmark, + __u32 flags) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + __be16 port = dport; + + svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); + if (!svc) + return NULL; + if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) + port = 0; + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); + if (!dest) + dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); + return dest; +} + +void ip_vs_dest_dst_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_dst *dest_dst = container_of(head, + struct ip_vs_dest_dst, + rcu_head); + + dst_release(dest_dst->dst_cache); + kfree(dest_dst); +} + +/* Release dest_dst and dst_cache for dest in user context */ +static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, 1); + if (old) { + RCU_INIT_POINTER(dest->dest_dst, NULL); + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); + } +} + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, + const union nf_inet_addr *daddr, __be16 dport) +{ + struct ip_vs_dest *dest; + struct netns_ipvs *ipvs = svc->ipvs; + + /* + * Find the destination in trash + */ + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + refcount_read(&dest->refcnt)); + if (dest->af == dest_af && + ip_vs_addr_equal(dest_af, &dest->addr, daddr) && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && + dest->vport == svc->port))) { + /* HIT */ + list_del(&dest->t_list); + goto out; + } + } + + dest = NULL; + +out: + spin_unlock_bh(&ipvs->dest_trash_lock); + + return dest; +} + +static void ip_vs_dest_free(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + + __ip_vs_dst_cache_reset(dest); + __ip_vs_svc_put(svc, false); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. + */ +static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) +{ + struct ip_vs_dest *dest, *nxt; + + del_timer_sync(&ipvs->dest_trash_timer); + /* No need to use dest_trash_lock */ + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } +} + +static void +ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) +{ +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c + + spin_lock_bh(&src->lock); + + IP_VS_SHOW_STATS_COUNTER(conns); + IP_VS_SHOW_STATS_COUNTER(inpkts); + IP_VS_SHOW_STATS_COUNTER(outpkts); + IP_VS_SHOW_STATS_COUNTER(inbytes); + IP_VS_SHOW_STATS_COUNTER(outbytes); + + ip_vs_read_estimator(dst, src); + + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) +{ + dst->conns = (u32)src->conns; + dst->inpkts = (u32)src->inpkts; + dst->outpkts = (u32)src->outpkts; + dst->inbytes = src->inbytes; + dst->outbytes = src->outbytes; + dst->cps = (u32)src->cps; + dst->inpps = (u32)src->inpps; + dst->outpps = (u32)src->outpps; + dst->inbps = (u32)src->inbps; + dst->outbps = (u32)src->outbps; +} + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + + /* get current counters as zero point, rates are zeroed */ + +#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c + + IP_VS_ZERO_STATS_COUNTER(conns); + IP_VS_ZERO_STATS_COUNTER(inpkts); + IP_VS_ZERO_STATS_COUNTER(outpkts); + IP_VS_ZERO_STATS_COUNTER(inbytes); + IP_VS_ZERO_STATS_COUNTER(outbytes); + + ip_vs_zero_estimator(stats); + + spin_unlock_bh(&stats->lock); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, + struct ip_vs_dest_user_kern *udest, int add) +{ + struct netns_ipvs *ipvs = svc->ipvs; + struct ip_vs_service *old_svc; + struct ip_vs_scheduler *sched; + int conn_flags; + + /* We cannot modify an address and change the address family */ + BUG_ON(!add && udest->af != dest->af); + + if (add && udest->af != svc->af) + ipvs->mixed_address_family_dests++; + + /* keep the last_weight with latest non-0 weight */ + if (add || udest->weight != 0) + atomic_set(&dest->last_weight, udest->weight); + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; + conn_flags |= IP_VS_CONN_F_INACTIVE; + + /* Need to rehash? */ + if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != + IP_VS_DFWD_METHOD(dest) || + udest->tun_type != dest->tun_type || + udest->tun_port != dest->tun_port) + ip_vs_rs_unhash(dest); + + /* set the tunnel info */ + dest->tun_type = udest->tun_type; + dest->tun_port = udest->tun_port; + dest->tun_flags = udest->tun_flags; + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* FTP-NAT requires conntrack for mangling */ + if (svc->port == FTPPORT) + ip_vs_register_conntrack(svc); + } + atomic_set(&dest->conn_flags, conn_flags); + /* Put the real service in rs_table if not present. */ + ip_vs_rs_hash(ipvs, dest); + + /* bind the service */ + old_svc = rcu_dereference_protected(dest->svc, 1); + if (!old_svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (old_svc != svc) { + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + __ip_vs_svc_put(old_svc, true); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; + + dest->af = udest->af; + + spin_lock_bh(&dest->dst_lock); + __ip_vs_dst_cache_reset(dest); + spin_unlock_bh(&dest->dst_lock); + + if (add) { + ip_vs_start_estimator(svc->ipvs, &dest->stats); + list_add_rcu(&dest->n_list, &svc->destinations); + svc->num_dests++; + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->add_dest) + sched->add_dest(svc, dest); + } else { + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->upd_dest) + sched->upd_dest(svc, dest); + } +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + unsigned int atype, i; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (udest->af == AF_INET6) { + int ret; + + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) + return -EINVAL; + + ret = nf_defrag_ipv6_enable(svc->ipvs->net); + if (ret) + return ret; + } else +#endif + { + atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); + if (dest == NULL) + return -ENOMEM; + + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) + goto err_alloc; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_dest_stats; + ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); + u64_stats_init(&ip_vs_dest_stats->syncp); + } + + dest->af = udest->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + refcount_set(&dest->refcnt, 1); + + INIT_HLIST_NODE(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest, 1); + + LeaveFunction(2); + return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); + + /* We use function that requires RCU lock */ + rcu_read_lock(); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); + rcu_read_unlock(); + + if (dest != NULL) { + IP_VS_DBG(1, "%s(): dest already exists\n", __func__); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), + refcount_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + __ip_vs_update_dest(svc, dest, udest, 1); + ret = 0; + } else { + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest); + } + LeaveFunction(2); + + return ret; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); + + /* We use function that requires RCU lock */ + rcu_read_lock(); + dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); + rcu_read_unlock(); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest, 0); + LeaveFunction(2); + + return 0; +} + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, + bool cleanup) +{ + ip_vs_stop_estimator(ipvs, &dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + ip_vs_rs_unhash(dest); + + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + refcount_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash with reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = 0; + spin_unlock_bh(&ipvs->dest_trash_lock); + + /* Queue up delayed work to expire all no destination connections. + * No-op when CONFIG_SYSCTL is disabled. + */ + if (!cleanup) + ip_vs_enqueue_expire_nodest_conns(ipvs); +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del_rcu(&dest->n_list); + svc->num_dests--; + + if (dest->af != svc->af) + svc->ipvs->mixed_address_family_dests--; + + if (svcupd) { + struct ip_vs_scheduler *sched; + + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->del_dest) + sched->del_dest(svc, dest); + } +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + /* We use function that requires RCU lock */ + rcu_read_lock(); + dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); + rcu_read_unlock(); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): destination not found!\n", __func__); + return -ENOENT; + } + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + /* + * Delete the destination + */ + __ip_vs_del_dest(svc->ipvs, dest, false); + + LeaveFunction(2); + + return 0; +} + +static void ip_vs_dest_trash_expire(struct timer_list *t) +{ + struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); + struct ip_vs_dest *dest, *next; + unsigned long now = jiffies; + + spin_lock(&ipvs->dest_trash_lock); + list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { + if (refcount_read(&dest->refcnt) > 1) + continue; + if (dest->idle_start) { + if (time_before(now, dest->idle_start + + IP_VS_DEST_TRASH_PERIOD)) + continue; + } else { + dest->idle_start = max(1UL, now); + continue; + } + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->t_list); + ip_vs_dest_free(dest); + } + if (!list_empty(&ipvs->dest_trash)) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + spin_unlock(&ipvs->dest_trash_lock); +} + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0, i; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; + struct ip_vs_service *svc = NULL; + int ret_hooks = -1; + + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + + /* Lookup the scheduler by 'u->sched_name' */ + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_err; + } + } + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out_err; + } + + ret = nf_defrag_ipv6_enable(ipvs->net); + if (ret) + goto out_err; + } +#endif + + if ((u->af == AF_INET && !ipvs->num_services) || + (u->af == AF_INET6 && !ipvs->num_services6)) { + ret = ip_vs_register_hooks(ipvs, u->af); + if (ret < 0) + goto out_err; + ret_hooks = ret; + } + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); + if (svc == NULL) { + IP_VS_DBG(1, "%s(): no memory\n", __func__); + ret = -ENOMEM; + goto out_err; + } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) { + ret = -ENOMEM; + goto out_err; + } + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ip_vs_stats; + ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); + u64_stats_init(&ip_vs_stats->syncp); + } + + + /* I'm the first user of the service */ + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + svc->ipvs = ipvs; + + INIT_LIST_HEAD(&svc->destinations); + spin_lock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + } + + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ipvs->nullsvc_counter); + if (svc->pe && svc->pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); + + ip_vs_start_estimator(ipvs, &svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services++; + else if (svc->af == AF_INET6) + ipvs->num_services6++; + + /* Hash the service into the service table */ + ip_vs_svc_hash(svc); + + *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; + return 0; + + + out_err: + if (ret_hooks >= 0) + ip_vs_unregister_hooks(ipvs, u->af); + if (svc != NULL) { + ip_vs_unbind_scheduler(svc, sched); + ip_vs_service_free(svc); + } + ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched = NULL, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; + int ret = 0; + bool new_pe_conn_out, old_pe_conn_out; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + } + old_sched = sched; + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + __u32 plen = (__force __u32) u->netmask; + + if (plen < 1 || plen > 128) { + ret = -EINVAL; + goto out; + } + } +#endif + + old_sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched != old_sched) { + if (old_sched) { + ip_vs_unbind_scheduler(svc, old_sched); + RCU_INIT_POINTER(svc->scheduler, NULL); + /* Wait all svc->sched_data users */ + synchronize_rcu(); + } + /* Bind the new scheduler */ + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + ip_vs_scheduler_put(sched); + goto out; + } + } + } + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_pe = rcu_dereference_protected(svc->pe, 1); + if (pe != old_pe) { + rcu_assign_pointer(svc->pe, pe); + /* check for optional methods in new pe */ + new_pe_conn_out = (pe && pe->conn_out) ? true : false; + old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; + if (new_pe_conn_out && !old_pe_conn_out) + atomic_inc(&svc->ipvs->conn_out_counter); + if (old_pe_conn_out && !new_pe_conn_out) + atomic_dec(&svc->ipvs->conn_out_counter); + } + +out: + ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); + return ret; +} + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = svc->ipvs; + + if (svc->af == AF_INET) { + ipvs->num_services--; + if (!ipvs->num_services) + ip_vs_unregister_hooks(ipvs, svc->af); + } else if (svc->af == AF_INET6) { + ipvs->num_services6--; + if (!ipvs->num_services6) + ip_vs_unregister_hooks(ipvs, svc->af); + } + + ip_vs_stop_estimator(svc->ipvs, &svc->stats); + + /* Unbind scheduler */ + old_sched = rcu_dereference_protected(svc->scheduler, 1); + ip_vs_unbind_scheduler(svc, old_sched); + ip_vs_scheduler_put(old_sched); + + /* Unbind persistence engine, keep svc->pe */ + old_pe = rcu_dereference_protected(svc->pe, 1); + if (old_pe && old_pe->conn_out) + atomic_dec(&ipvs->conn_out_counter); + ip_vs_pe_put(old_pe); + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(svc->ipvs, dest, cleanup); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ipvs->nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + __ip_vs_svc_put(svc, true); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Unlink a service from list and try to delete it if its refcnt reached 0 + */ +static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) +{ + ip_vs_unregister_conntrack(svc); + /* Hold svc to avoid double release from dest_trash */ + atomic_inc(&svc->refcnt); + /* + * Unhash it from the service table + */ + ip_vs_svc_unhash(svc); + + __ip_vs_del_service(svc, cleanup); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + ip_vs_unlink_service(svc, false); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) +{ + int idx; + struct ip_vs_service *svc; + struct hlist_node *n; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], + s_list) { + if (svc->ipvs == ipvs) + ip_vs_unlink_service(svc, cleanup); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], + f_list) { + if (svc->ipvs == ipvs) + ip_vs_unlink_service(svc, cleanup); + } + } + + return 0; +} + +/* + * Delete service by {netns} in the service table. + * Called by __ip_vs_batch_cleanup() + */ +void ip_vs_service_nets_cleanup(struct list_head *net_list) +{ + struct netns_ipvs *ipvs; + struct net *net; + + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + list_for_each_entry(net, net_list, exit_list) { + ipvs = net_ipvs(net); + ip_vs_flush(ipvs, true); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} + +/* Put all references for device (dst_cache) */ +static inline void +ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) +{ + struct ip_vs_dest_dst *dest_dst; + + spin_lock_bh(&dest->dst_lock); + dest_dst = rcu_dereference_protected(dest->dest_dst, 1); + if (dest_dst && dest_dst->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + refcount_read(&dest->refcnt)); + __ip_vs_dst_cache_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* Netdev event receiver + * Currently only NETDEV_DOWN is handled to release refs to cached dsts + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_DOWN || !ipvs) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (svc->ipvs == ipvs) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); + } + } + } + + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (svc->ipvs == ipvs) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + ip_vs_forget_dev(dest, dev); + } + } + + } + } + + spin_lock_bh(&ipvs->dest_trash_lock); + list_for_each_entry(dest, &ipvs->dest_trash, t_list) { + ip_vs_forget_dev(dest, dev); + } + spin_unlock_bh(&ipvs->dest_trash_lock); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + return 0; +} + +static int ip_vs_zero_all(struct netns_ipvs *ipvs) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (svc->ipvs == ipvs) + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (svc->ipvs == ipvs) + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ipvs->tot_stats); + return 0; +} + +#ifdef CONFIG_SYSCTL + +static int +proc_do_defense_mode(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int rc; + + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (val < 0 || val > 3) { + rc = -EINVAL; + } else { + *valp = val; + update_defense_level(ipvs); + } + } + return rc; +} + +static int +proc_do_sync_threshold(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val[2]; + int rc; + struct ctl_table tmp = { + .data = &val, + .maxlen = table->maxlen, + .mode = table->mode, + }; + + mutex_lock(&ipvs->sync_mutex); + memcpy(val, valp, sizeof(val)); + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write) { + if (val[0] < 0 || val[1] < 0 || + (val[0] >= val[1] && val[1])) + rc = -EINVAL; + else + memcpy(valp, val, sizeof(val)); + } + mutex_unlock(&ipvs->sync_mutex); + return rc; +} + +static int +proc_do_sync_ports(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if (val < 1 || !is_power_of_2(val)) + rc = -EINVAL; + else + *valp = val; + } + return rc; +} + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * align with netns init in ip_vs_control_net_init() + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "am_droprate", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_entry", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "secure_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "snat_reroute", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sync_ports", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_sync_ports, + }, + { + .procname = "sync_persist_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_qlen_max", + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "sync_sock_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sloppy_sctp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = proc_do_sync_threshold, + }, + { + .procname = "sync_refresh_period", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "sync_retries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "nat_icmp_send", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "pmtu_disc", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "conn_reuse_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "schedule_icmp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ignore_tunneled", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "run_estimation", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { } +}; + +#endif + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct seq_net_private p; /* Do not move this, netns depends upon it*/ + struct hlist_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned int flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { + if ((svc->ipvs == ipvs) && pos-- == 0) { + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], + f_list) { + if ((svc->ipvs == ipvs) && pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct hlist_node *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + e = rcu_dereference(hlist_next_rcu(&svc->s_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, s_list); + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + e = rcu_dereference(hlist_next_rcu(&svc->f_list)); + if (e) + return hlist_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + hlist_for_each_entry_rcu(svc, + &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; + + if (svc->ipvs != ipvs) + return 0; + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [%pI6]:%04X %s ", + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, + ntohs(svc->port), + sched_name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + sched_name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } else { + seq_printf(seq, "FWM %08X %s %s", + svc->fwmark, sched_name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [%pI6]:%04X" + " %-7s %-6d %-10d %-10d\n", + &dest->addr.in6, + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_kstats show; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_puts(seq, + " Conns Packets Packets Bytes Bytes\n"); + + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)show.conns, + (unsigned long long)show.inpkts, + (unsigned long long)show.outpkts, + (unsigned long long)show.inbytes, + (unsigned long long)show.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", + (unsigned long long)show.cps, + (unsigned long long)show.inpps, + (unsigned long long)show.outpps, + (unsigned long long)show.inbps, + (unsigned long long)show.outbps); + + return 0; +} + +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; + struct ip_vs_kstats kstats; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_puts(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); + unsigned int start; + u64 conns, inpkts, outpkts, inbytes, outbytes; + + do { + start = u64_stats_fetch_begin_irq(&u->syncp); + conns = u64_stats_read(&u->cnt.conns); + inpkts = u64_stats_read(&u->cnt.inpkts); + outpkts = u64_stats_read(&u->cnt.outpkts); + inbytes = u64_stats_read(&u->cnt.inbytes); + outbytes = u64_stats_read(&u->cnt.outbytes); + } while (u64_stats_fetch_retry_irq(&u->syncp, start)); + + seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", + i, (u64)conns, (u64)inpkts, + (u64)outpkts, (u64)inbytes, + (u64)outbytes); + } + + ip_vs_copy_stats(&kstats, tot_stats); + + seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)kstats.conns, + (unsigned long long)kstats.inpkts, + (unsigned long long)kstats.outpkts, + (unsigned long long)kstats.inbytes, + (unsigned long long)kstats.outbytes); + +/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", + kstats.cps, + kstats.inpps, + kstats.outpps, + kstats.inbps, + kstats.outbps); + + return 0; +} +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || + u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { + return -EINVAL; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) + return -EINVAL; +#endif + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + +#define CMDID(cmd) (cmd - IP_VS_BASE_CTL) + +struct ip_vs_svcdest_user { + struct ip_vs_service_user s; + struct ip_vs_dest_user d; +}; + +static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { + [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), + [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), + [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), + [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), + [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), + [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), +}; + +union ip_vs_set_arglen { + struct ip_vs_service_user field_IP_VS_SO_SET_ADD; + struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; + struct ip_vs_service_user field_IP_VS_SO_SET_DEL; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; + struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; + struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; + struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; + struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; + struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; +}; + +#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + memset(udest, 0, sizeof(*udest)); + + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; + udest->af = AF_INET; + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) +{ + struct net *net = sock_net(sk); + int ret; + unsigned char arg[MAX_SET_ARGLEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + struct netns_ipvs *ipvs = net_ipvs(net); + + BUILD_BUG_ON(sizeof(arg) > 255); + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) + return -EINVAL; + if (len != set_arglen[CMDID(cmd)]) { + IP_VS_DBG(1, "set_ctl: len %u != %u\n", + len, set_arglen[CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_sockptr(arg, ptr, len) != 0) + return -EFAULT; + + /* Handle daemons since they have another lock */ + if (cmd == IP_VS_SO_SET_STARTDAEMON || + cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + + if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ipvs_sync_daemon_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + ret = -EINVAL; + if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)) <= 0) + return ret; + cfg.syncid = dm->syncid; + ret = start_sync_thread(ipvs, &cfg, dm->state); + } else { + ret = stop_sync_thread(ipvs, dm->state); + } + return ret; + } + + mutex_lock(&__ip_vs_mutex); + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(ipvs, false); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (!len) { + /* No more commands with len == 0 below */ + ret = -EINVAL; + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(ipvs); + goto out_unlock; + } + } + + if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && + strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == + IP_VS_SCHEDNAME_MAXLEN) { + ret = -EINVAL; + goto out_unlock; + } + + /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && + usvc.protocol != IPPROTO_SCTP) { + pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", + usvc.protocol, &usvc.addr.ip, + ntohs(usvc.port)); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by or fwmark */ + rcu_read_lock(); + if (usvc.fwmark == 0) + svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); + rcu_read_unlock(); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(ipvs, &usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + } + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + struct ip_vs_scheduler *sched; + struct ip_vs_kstats kstats; + char *sched_name; + + sched = rcu_dereference_protected(src->scheduler, 1); + sched_name = sched ? sched->name : "none"; + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&kstats, &src->stats); + ip_vs_export_stats_user(&dst->stats, &kstats); +} + +static inline int +__ip_vs_get_service_entries(struct netns_ipvs *ipvs, + const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || (svc->ipvs != ipvs)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || (svc->ipvs != ipvs)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } +out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + rcu_read_lock(); + if (get->fwmark) + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); + else + svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, + get->port); + rcu_read_unlock(); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + struct ip_vs_kstats kstats; + + memset(&entry, 0, sizeof(entry)); + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + /* Cannot expose heterogeneous members via sockopt + * interface + */ + if (dest->af != svc->af) + continue; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&kstats, &dest->stats); + ip_vs_export_stats_user(&entry.stats, &kstats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + memset(u, 0, sizeof (*u)); + +#ifdef CONFIG_IP_VS_PROTO_TCP + pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); + u->udp_timeout = + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + +static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { + [CMDID(IP_VS_SO_GET_VERSION)] = 64, + [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), + [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), + [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), + [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), + [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), + [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), +}; + +union ip_vs_get_arglen { + char field_IP_VS_SO_GET_VERSION[64]; + struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; + struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; + struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; + struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; + struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; + struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; +}; + +#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[MAX_GET_ARGLEN]; + int ret = 0; + unsigned int copylen; + struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + BUG_ON(!net); + BUILD_BUG_ON(sizeof(arg) > 255); + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) + return -EINVAL; + + copylen = get_arglen[CMDID(cmd)]; + if (*len < (int) copylen) { + IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); + return -EINVAL; + } + + if (copy_from_user(arg, user, copylen) != 0) + return -EFAULT; + /* + * Handle daemons first since it has its own locking + */ + if (cmd == IP_VS_SO_GET_DAEMON) { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + mutex_lock(&ipvs->sync_mutex); + if (ipvs->sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->mcfg.syncid; + } + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->bcfg.syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + mutex_unlock(&ipvs->sync_mutex); + return ret; + } + + mutex_lock(&__ip_vs_mutex); + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = ip_vs_conn_tab_size; + info.num_services = ipvs->num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = struct_size(get, entrytable, get->num_services); + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(ipvs, get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + rcu_read_lock(); + if (entry->fwmark) + svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); + else + svc = __ip_vs_service_find(ipvs, AF_INET, + entry->protocol, &addr, + entry->port); + rcu_read_unlock(); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = struct_size(get, entrytable, get->num_dests); + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(ipvs, get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(ipvs, &t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN - 1 }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, + [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_kstats *kstats) +{ + struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); + + if (!nl_stats) + return -EMSGSIZE; + + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) + goto nla_put_failure; + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, + struct ip_vs_kstats *kstats) +{ + struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); + + if (!nl_stats) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, + IPVS_STATS_ATTR_PAD)) + goto nla_put_failure; + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + struct ip_vs_pe *pe; + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + struct ip_vs_kstats kstats; + char *sched_name; + + nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) + goto nla_put_failure; + if (svc->fwmark) { + if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) + goto nla_put_failure; + } else { + if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || + nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || + nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) + goto nla_put_failure; + } + + sched = rcu_dereference_protected(svc->scheduler, 1); + sched_name = sched ? sched->name : "none"; + pe = rcu_dereference_protected(svc->pe, 1); + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || + (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || + nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || + nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || + nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) + goto nla_put_failure; + ip_vs_copy_stats(&kstats, &svc->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start || (svc->ipvs != ipvs)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start || (svc->ipvs != ipvs)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static bool ip_vs_is_af_valid(int af) +{ + if (af == AF_INET) + return true; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6 && ipv6_mod_enabled()) + return true; +#endif + return false; +} + +static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, + struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, bool full_entry, + struct ip_vs_service **ret_svc) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + struct ip_vs_service *svc; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = nla_get_u16(nla_af); + if (!ip_vs_is_af_valid(usvc->af)) + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_be16(nla_port); + usvc->fwmark = 0; + } + + rcu_read_lock(); + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + rcu_read_unlock(); + *ret_svc = svc; + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (svc) + usvc->flags = svc->flags; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_be32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, + struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + int ret; + + ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); + return ret ? ERR_PTR(ret) : svc; +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + struct ip_vs_kstats kstats; + + nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || + nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || + nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, + (atomic_read(&dest->conn_flags) & + IP_VS_CONN_F_FWD_MASK)) || + nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, + atomic_read(&dest->weight)) || + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, + dest->tun_type) || + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, + dest->tun_port) || + nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, + dest->tun_flags) || + nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || + nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)) || + nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)) || + nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) + goto nla_put_failure; + ip_vs_copy_stats(&kstats, &dest->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) + goto out_err; + + + svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR_OR_NULL(svc)) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, bool full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + struct nlattr *nla_addr_family; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + memset(udest, 0, sizeof(*udest)); + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_be16(nla_port); + + if (nla_addr_family) + udest->af = nla_get_u16(nla_addr_family); + else + udest->af = 0; + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh, *nla_tun_type, *nla_tun_port, + *nla_tun_flags; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; + nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + + if (nla_tun_type) + udest->tun_type = nla_get_u8(nla_tun_type); + + if (nla_tun_port) + udest->tun_port = nla_get_be16(nla_tun_port); + + if (nla_tun_flags) + udest->tun_flags = nla_get_u16(nla_tun_flags); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, + struct ipvs_sync_daemon_cfg *c) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) + goto nla_put_failure; +#ifdef CONFIG_IP_VS_IPV6 + if (c->mcast_af == AF_INET6) { + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, + &c->mcast_group.in6)) + goto nla_put_failure; + } else +#endif + if (c->mcast_af == AF_INET && + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, + c->mcast_group.ip)) + goto nla_put_failure; + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, + struct ipvs_sync_daemon_cfg *c, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, c)) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + &ipvs->mcfg, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + &ipvs->bcfg, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&ipvs->sync_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) +{ + struct ipvs_sync_daemon_cfg c; + struct nlattr *a; + int ret; + + memset(&c, 0, sizeof(c)); + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + sizeof(c.mcast_ifn)); + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); + + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; + if (a) + c.sync_maxlen = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; + if (a) { + c.mcast_af = AF_INET; + c.mcast_group.ip = nla_get_in_addr(a); + if (!ipv4_is_multicast(c.mcast_group.ip)) + return -EINVAL; + } else { + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; + if (a) { +#ifdef CONFIG_IP_VS_IPV6 + int addr_type; + + c.mcast_af = AF_INET6; + c.mcast_group.in6 = nla_get_in6_addr(a); + addr_type = ipv6_addr_type(&c.mcast_group.in6); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + return -EINVAL; +#else + return -EAFNOSUPPORT; +#endif + } + } + + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; + if (a) + c.mcast_port = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; + if (a) + c.mcast_ttl = nla_get_u8(a); + + /* The synchronization protocol is incompatible with mixed family + * services + */ + if (ipvs->mixed_address_family_dests > 0) + return -EINVAL; + + ret = start_sync_thread(ipvs, &c, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + return ret; +} + +static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) +{ + int ret; + + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + ret = stop_sync_thread(ipvs, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + return ret; +} + +static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(ipvs, &t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(ipvs, &t); +} + +static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) +{ + int ret = -EINVAL, cmd; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) + goto out; + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); + else + ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); + } + +out: + return ret; +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + bool need_full_svc = false, need_full_dest = false; + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(ipvs, false); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(ipvs, info->attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(ipvs); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = true; + + ret = ip_vs_genl_parse_service(ipvs, &usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc, &svc); + if (ret) + goto out; + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = true; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + + /* Old protocols did not allow the user to specify address + * family, so we set it to zero instead. We also didn't + * allow heterogeneous pools in the old code, so it's safe + * to assume that this will have the same address family as + * the service. + */ + if (udest.af == 0) + udest.af = svc->af; + + if (!ip_vs_is_af_valid(udest.af)) { + ret = -EAFNOSUPPORT; + goto out; + } + + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { + /* The synchronization protocol is incompatible + * with mixed family services + */ + if (ipvs->sync_state) { + ret = -EINVAL; + goto out; + } + + /* Which connection types do we support? */ + switch (udest.conn_flags) { + case IP_VS_CONN_F_TUNNEL: + /* We are able to forward this */ + break; + default: + ret = -EINVAL; + goto out; + } + } + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(ipvs, &usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + /* do not use svc, it can be freed */ + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + struct net *net = sock_net(skb->sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + pr_err("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(ipvs, + info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(ipvs, &t); +#ifdef CONFIG_IP_VS_PROTO_TCP + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, + t.tcp_timeout) || + nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout)) + goto nla_put_failure; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) + goto nla_put_failure; +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, + IP_VS_VERSION_CODE) || + nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size)) + goto nla_put_failure; + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + goto out; + +nla_put_failure: + pr_err("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static const struct genl_small_ops ip_vs_genl_ops[] = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_daemon, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_daemon, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static struct genl_family ip_vs_genl_family __ro_after_init = { + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_ATTR_MAX, + .policy = ip_vs_cmd_policy, + .netnsok = true, /* Make ipvsadm to work on netns */ + .module = THIS_MODULE, + .small_ops = ip_vs_genl_ops, + .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), + .resv_start_op = IPVS_CMD_FLUSH + 1, +}; + +static int __init ip_vs_genl_register(void) +{ + return genl_register_family(&ip_vs_genl_family); +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + +/* + * per netns intit/exit func. + */ +#ifdef CONFIG_SYSCTL +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) +{ + struct net *net = ipvs->net; + int idx; + struct ctl_table *tbl; + + atomic_set(&ipvs->dropentry, 0); + spin_lock_init(&ipvs->dropentry_lock); + spin_lock_init(&ipvs->droppacket_lock); + spin_lock_init(&ipvs->securetcp_lock); + + if (!net_eq(net, &init_net)) { + tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + tbl[0].procname = NULL; + } else + tbl = vs_vars; + /* Initialize sysctl defaults */ + for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { + if (tbl[idx].proc_handler == proc_do_defense_mode) + tbl[idx].extra2 = ipvs; + } + idx = 0; + ipvs->sysctl_amemthresh = 1024; + tbl[idx++].data = &ipvs->sysctl_amemthresh; + ipvs->sysctl_am_droprate = 10; + tbl[idx++].data = &ipvs->sysctl_am_droprate; + tbl[idx++].data = &ipvs->sysctl_drop_entry; + tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT + tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif + tbl[idx++].data = &ipvs->sysctl_secure_tcp; + ipvs->sysctl_snat_reroute = 1; + tbl[idx++].data = &ipvs->sysctl_snat_reroute; + ipvs->sysctl_sync_ver = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ver; + ipvs->sysctl_sync_ports = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ports; + tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; + tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; + tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; + tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; + ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; + ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; + tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx].extra2 = ipvs; + tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; + tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; + ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); + tbl[idx++].data = &ipvs->sysctl_sync_retries; + tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + ipvs->sysctl_pmtu_disc = 1; + tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; + ipvs->sysctl_conn_reuse_mode = 1; + tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; + tbl[idx++].data = &ipvs->sysctl_schedule_icmp; + tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + tbl[idx++].data = &ipvs->sysctl_run_estimation; +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif + + ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); + if (ipvs->sysctl_hdr == NULL) { + if (!net_eq(net, &init_net)) + kfree(tbl); + return -ENOMEM; + } + ip_vs_start_estimator(ipvs, &ipvs->tot_stats); + ipvs->sysctl_tbl = tbl; + /* Schedule defense work */ + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + queue_delayed_work(system_long_wq, &ipvs->defense_work, + DEFENSE_TIMER_PERIOD); + + /* Init delayed work for expiring no dest conn */ + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); + + return 0; +} + +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) +{ + struct net *net = ipvs->net; + + cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); + + if (!net_eq(net, &init_net)) + kfree(ipvs->sysctl_tbl); +} + +#else + +static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } +static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } + +#endif + +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +#ifdef CONFIG_IP_VS_IPV6 + .priority = ADDRCONF_NOTIFY_PRIORITY + 5, +#endif +}; + +int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) +{ + int i, idx; + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_HLIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + spin_lock_init(&ipvs->dest_trash_lock); + timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + atomic_set(&ipvs->conn_out_counter, 0); + + /* procfs stats */ + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats.cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *ipvs_tot_stats; + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + u64_stats_init(&ipvs_tot_stats->syncp); + } + + spin_lock_init(&ipvs->tot_stats.lock); + +#ifdef CONFIG_PROC_FS + if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, + &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) + goto err_vs; + if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, + ip_vs_stats_show, NULL)) + goto err_stats; + if (!proc_create_net_single("ip_vs_stats_percpu", 0, + ipvs->net->proc_net, + ip_vs_stats_percpu_show, NULL)) + goto err_percpu; +#endif + + if (ip_vs_control_net_init_sysctl(ipvs)) + goto err; + + return 0; + +err: +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + +err_percpu: + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + +err_stats: + remove_proc_entry("ip_vs", ipvs->net->proc_net); + +err_vs: +#endif + free_percpu(ipvs->tot_stats.cpustats); + return -ENOMEM; +} + +void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) +{ + ip_vs_trash_cleanup(ipvs); + ip_vs_control_net_cleanup_sysctl(ipvs); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + remove_proc_entry("ip_vs", ipvs->net->proc_net); +#endif + free_percpu(ipvs->tot_stats.cpustats); +} + +int __init ip_vs_register_nl_ioctl(void) +{ + int ret; + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + pr_err("cannot register sockopt.\n"); + goto err_sock; + } + + ret = ip_vs_genl_register(); + if (ret) { + pr_err("cannot register Generic Netlink interface.\n"); + goto err_genl; + } + return 0; + +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: + return ret; +} + +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); + INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? */ + + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + return ret; + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + unregister_netdevice_notifier(&ip_vs_dst_notifier); + LeaveFunction(2); +} diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c new file mode 100644 index 000000000..5e6ec32af --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Destination Hashing scheduling module + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * Changes: + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + +struct ip_vs_dh_state { + struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; + struct rcu_head rcu_head; +}; + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) +{ + return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + bool empty; + + b = &s->buckets[0]; + p = &svc->destinations; + empty = list_empty(p); + for (i=0; idest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_dh_flush(struct ip_vs_dh_state *s) +{ + int i; + struct ip_vs_dh_bucket *b; + struct ip_vs_dest *dest; + + b = &s->buckets[0]; + for (i=0; idest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_state *s; + + /* allocate the DH table for this service */ + s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + + svc->sched_data = s; + IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with current dests */ + ip_vs_dh_reassign(s, svc); + + return 0; +} + + +static void ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_state *s = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(s); + + /* release the table itself */ + kfree_rcu(s, rcu_head); + IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); +} + + +static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_dh_state *s = svc->sched_data; + + /* assign the hash buckets with the updated service */ + ip_vs_dh_reassign(s, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_state *s; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + s = (struct ip_vs_dh_state *) svc->sched_data; + dest = ip_vs_dh_get(svc->af, s, &iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->daddr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .add_dest = ip_vs_dh_dest_changed, + .del_dest = ip_vs_dh_dest_changed, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); + synchronize_rcu(); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 000000000..f53150d82 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang + * + * Changes: Hans Schillstrom + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * Affected data: est_list and est_lock. + * estimation_timer() runs with timer per netns. + * get_stats()) do the per cpu summing. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. + + * Netlink users can see 64-bit values but sockopt users are restricted + to 32-bit values for conns, packets, bps, cps and pps. + + * A lot of code is taken from net/core/gen_estimator.c + */ + + +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, + struct ip_vs_cpu_stats __percpu *stats) +{ + int i; + bool add = false; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); + unsigned int start; + u64 conns, inpkts, outpkts, inbytes, outbytes; + + if (add) { + do { + start = u64_stats_fetch_begin(&s->syncp); + conns = u64_stats_read(&s->cnt.conns); + inpkts = u64_stats_read(&s->cnt.inpkts); + outpkts = u64_stats_read(&s->cnt.outpkts); + inbytes = u64_stats_read(&s->cnt.inbytes); + outbytes = u64_stats_read(&s->cnt.outbytes); + } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->conns += conns; + sum->inpkts += inpkts; + sum->outpkts += outpkts; + sum->inbytes += inbytes; + sum->outbytes += outbytes; + } else { + add = true; + do { + start = u64_stats_fetch_begin(&s->syncp); + sum->conns = u64_stats_read(&s->cnt.conns); + sum->inpkts = u64_stats_read(&s->cnt.inpkts); + sum->outpkts = u64_stats_read(&s->cnt.outpkts); + sum->inbytes = u64_stats_read(&s->cnt.inbytes); + sum->outbytes = u64_stats_read(&s->cnt.outbytes); + } while (u64_stats_fetch_retry(&s->syncp, start)); + } + } +} + + +static void estimation_timer(struct timer_list *t) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u64 rate; + struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); + + if (!sysctl_run_estimation(ipvs)) + goto skip; + + spin_lock(&ipvs->est_lock); + list_for_each_entry(e, &ipvs->est_list, list) { + s = container_of(e, struct ip_vs_stats, est); + + spin_lock(&s->lock); + ip_vs_read_cpu_stats(&s->kstats, s->cpustats); + + /* scaled by 2^10, but divided 2 seconds */ + rate = (s->kstats.conns - e->last_conns) << 9; + e->last_conns = s->kstats.conns; + e->cps += ((s64)rate - (s64)e->cps) >> 2; + + rate = (s->kstats.inpkts - e->last_inpkts) << 9; + e->last_inpkts = s->kstats.inpkts; + e->inpps += ((s64)rate - (s64)e->inpps) >> 2; + + rate = (s->kstats.outpkts - e->last_outpkts) << 9; + e->last_outpkts = s->kstats.outpkts; + e->outpps += ((s64)rate - (s64)e->outpps) >> 2; + + /* scaled by 2^5, but divided 2 seconds */ + rate = (s->kstats.inbytes - e->last_inbytes) << 4; + e->last_inbytes = s->kstats.inbytes; + e->inbps += ((s64)rate - (s64)e->inbps) >> 2; + + rate = (s->kstats.outbytes - e->last_outbytes) << 4; + e->last_outbytes = s->kstats.outbytes; + e->outbps += ((s64)rate - (s64)e->outbps) >> 2; + spin_unlock(&s->lock); + } + spin_unlock(&ipvs->est_lock); + +skip: + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +} + +void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&ipvs->est_lock); + list_del(&est->list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + struct ip_vs_kstats *k = &stats->kstats; + + /* reset counters, caller must hold the stats->lock lock */ + est->last_inbytes = k->inbytes; + est->last_outbytes = k->outbytes; + est->last_conns = k->conns; + est->last_inpkts = k->inpkts; + est->last_outpkts = k->outpkts; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e = &stats->est; + + dst->cps = (e->cps + 0x1FF) >> 10; + dst->inpps = (e->inpps + 0x1FF) >> 10; + dst->outpps = (e->outpps + 0x1FF) >> 10; + dst->inbps = (e->inbps + 0xF) >> 5; + dst->outbps = (e->outbps + 0xF) >> 5; +} + +int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) +{ + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + timer_setup(&ipvs->est_timer, estimation_timer, 0); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + return 0; +} + +void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) +{ + del_timer_sync(&ipvs->est_timer); +} diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c new file mode 100644 index 000000000..b846cc385 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Weighted Fail Over module + * + * Authors: Kenny Mathis + * + * Changes: + * Kenny Mathis : added initial functionality based on weight + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* Weighted Fail Over Module */ +static struct ip_vs_dest * +ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *hweight = NULL; + int hw = 0; /* Track highest weight */ + + IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n"); + + /* Basic failover functionality + * Find virtual server with highest weight and send it traffic + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > hw) { + hweight = dest; + hw = atomic_read(&dest->weight); + } + } + + if (hweight) { + IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", + IP_VS_DBG_ADDR(hweight->af, &hweight->addr), + ntohs(hweight->port), + atomic_read(&hweight->activeconns), + atomic_read(&hweight->weight)); + return hweight; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_fo_scheduler = { + .name = "fo", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list), + .schedule = ip_vs_fo_schedule, +}; + +static int __init ip_vs_fo_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_fo_scheduler); +} + +static void __exit ip_vs_fo_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_fo_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_fo_init); +module_exit(ip_vs_fo_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 000000000..ef1f45e43 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,637 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang + * + * Changes: + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING_PASV "227 " +#define CLIENT_STRING_PORT "PORT" +#define SERVER_STRING_EPSV "229 " +#define CLIENT_STRING_EPRT "EPRT" + +enum { + IP_VS_FTP_ACTIVE = 0, + IP_VS_FTP_PORT = 0, + IP_VS_FTP_PASV, + IP_VS_FTP_EPRT, + IP_VS_FTP_EPSV, +}; + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static unsigned int ports_count = 1; +static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, ushort, &ports_count, 0444); +MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); + + +static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) +{ + struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); + + if ((th->doff << 2) < sizeof(struct tcphdr)) + return NULL; + + return (char *)th + (th->doff << 2); +} + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern". is in network order. + * Parse extended format depending on ext. In this case addr can be pre-set. + */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, + char skip, bool ext, int mode, + union nf_inet_addr *addr, __be16 *port, + __u16 af, char **start, char **end) +{ + char *s, c; + unsigned char p[6]; + char edelim; + __u16 hport; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strncasecmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strncasecmp(data, pattern, plen) != 0) { + return 0; + } + s = data + plen; + if (skip) { + bool found = false; + + for (;; s++) { + if (s == data_limit) + return -1; + if (!found) { + /* "(" is optional for non-extended format, + * so catch the start of IPv4 address + */ + if (!ext && isdigit(*s)) + break; + if (*s == skip) + found = true; + } else if (*s != skip) { + break; + } + } + } + /* Old IPv4-only format? */ + if (!ext) { + p[0] = 0; + for (data = s; ; data++) { + if (data == data_limit) + return -1; + c = *data; + if (isdigit(c)) { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { + i++; + p[i] = 0; + } else { + /* unexpected character or terminator */ + break; + } + } + + if (i != 5) + return -1; + + *start = s; + *end = data; + addr->ip = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); + return 1; + } + if (s == data_limit) + return -1; + *start = s; + edelim = *s++; + if (edelim < 33 || edelim > 126) + return -1; + if (s == data_limit) + return -1; + if (*s == edelim) { + /* Address family is usually missing for EPSV response */ + if (mode != IP_VS_FTP_EPSV) + return -1; + s++; + if (s == data_limit) + return -1; + /* Then address should be missing too */ + if (*s != edelim) + return -1; + /* Caller can pre-set addr, if needed */ + s++; + } else { + const char *ep; + + /* We allow address only from same family */ + if (af == AF_INET6 && *s != '2') + return -1; + if (af == AF_INET && *s != '1') + return -1; + s++; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; + if (s == data_limit) + return -1; + if (af == AF_INET6) { + if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; + } else { + if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; + } + s = (char *) ep; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; + } + for (hport = 0; ; s++) + { + if (s == data_limit) + return -1; + if (!isdigit(*s)) + break; + hport = hport * 10 + *s - '0'; + } + if (s == data_limit || !hport || *s != edelim) + return -1; + s++; + *end = s; + *port = htons(hport); + return 1; +} + +/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + * The extended format for EPSV response provides usually only port: + * "229 Entering Extended Passive Mode (|||ppp|)" + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) +{ + char *data, *data_limit; + char *start, *end; + union nf_inet_addr from; + __be16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned int buf_len; + int ret = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (skb_ensure_writable(skb, skb->len)) + return 0; + + if (cp->app_data == (void *) IP_VS_FTP_PASV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); + data_limit = skb_tail_pointer(skb); + + if (!data || data >= data_limit) + return 1; + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING_PASV, + sizeof(SERVER_STRING_PASV)-1, + '(', false, IP_VS_FTP_PASV, + &from, &port, cp->af, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", + &from.ip, ntohs(port), &cp->caddr.ip, 0); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); + data_limit = skb_tail_pointer(skb); + + if (!data || data >= data_limit) + return 1; + + /* Usually, data address is not specified but + * we support different address, so pre-set it. + */ + from = cp->daddr; + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING_EPSV, + sizeof(SERVER_STRING_EPSV)-1, + '(', true, IP_VS_FTP_EPSV, + &from, &port, cp->af, + &start, &end) != 1) + return 1; + + IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); + } else { + return 1; + } + + /* Now update or create a connection entry for it */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } + if (!n_cp) { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, + cp->af, ipvsh->protocol, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, cp->af, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* Replace the old passive address with the new one */ + if (cp->app_data == (void *) IP_VS_FTP_PASV) { + from.ip = n_cp->vaddr.ip; + port = n_cp->vport; + snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&from.ip)[0], + ((unsigned char *)&from.ip)[1], + ((unsigned char *)&from.ip)[2], + ((unsigned char *)&from.ip)[3], + ntohs(port) >> 8, + ntohs(port) & 0xFF); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + from = n_cp->vaddr; + port = n_cp->vport; + /* Only port, client will use VIP for the data connection */ + snprintf(buf, sizeof(buf), "|||%u|", + ntohs(port)); + } else { + *buf = 0; + } + buf_len = strlen(buf); + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + bool mangled; + + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. + */ + mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + ipvsh->len, + start - data, + end - start, + buf, buf_len); + if (mangled) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + ipvsh->protocol, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } + } + + /* Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. + */ + + cp->app_data = (void *) IP_VS_FTP_ACTIVE; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; +} + + +/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + * Extended format: + * "EPSV\r\n" when client requests server address from same family + * "EPSV 1\r\n" when client requests IPv4 server address + * "EPSV 2\r\n" when client requests IPv6 server address + * "EPSV ALL\r\n" - not supported + * EPRT with specified delimiter (ASCII 33..126), "|" by default: + * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport + * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) +{ + char *data, *data_start, *data_limit; + char *start, *end; + union nf_inet_addr to; + __be16 port; + struct ip_vs_conn *n_cp; + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (skb_ensure_writable(skb, skb->len)) + return 0; + + data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); + data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; + + while (data <= data_limit - 6) { + if (cp->af == AF_INET && + strncasecmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(7, "got PASV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = (void *) IP_VS_FTP_PASV; + return 1; + } + + /* EPSV or EPSV */ + if (strncasecmp(data, "EPSV", 4) == 0 && + (data[4] == ' ' || data[4] == '\r')) { + if (data[4] == ' ') { + char proto = data[5]; + + if (data > data_limit - 7 || data[6] != '\r') + return 1; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && proto == '2') { + } else +#endif + if (cp->af == AF_INET && proto == '1') { + } else { + return 1; + } + } + /* Extended Passive mode on */ + IP_VS_DBG(7, "got EPSV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = (void *) IP_VS_FTP_EPSV; + return 1; + } + + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + if (cp->af == AF_INET && + ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_PORT, + sizeof(CLIENT_STRING_PORT)-1, + ' ', false, IP_VS_FTP_PORT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", + ip_vs_proto_name(ipvsh->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, + ntohs(cp->vport)-1); + } else if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_EPRT, + sizeof(CLIENT_STRING_EPRT)-1, + ' ', true, IP_VS_FTP_EPRT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", + ip_vs_proto_name(ipvsh->protocol), + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)-1); + } else { + return 1; + } + + /* Passive mode off */ + cp->app_data = (void *) IP_VS_FTP_ACTIVE; + + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + +/* + * per netns ip_vs_ftp initialization + */ +static int __net_init __ip_vs_ftp_init(struct net *net) +{ + int i, ret; + struct ip_vs_app *app; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + app = register_ip_vs_app(ipvs, &ip_vs_ftp); + if (IS_ERR(app)) + return PTR_ERR(app); + + for (i = 0; i < ports_count; i++) { + if (!ports[i]) + continue; + ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); + if (ret) + goto err_unreg; + } + return 0; + +err_unreg: + unregister_ip_vs_app(ipvs, &ip_vs_ftp); + return ret; +} +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return; + + unregister_ip_vs_app(ipvs, &ip_vs_ftp); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; + +static int __init ip_vs_ftp_init(void) +{ + /* rcu_barrier() is called by netns on error */ + return register_pernet_subsys(&ip_vs_ftp_ops); +} + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_pernet_subsys(&ip_vs_ftp_ops); + /* rcu_barrier() is called by netns */ +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 000000000..7ac7473e3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,630 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitialized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct hlist_node list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + struct timer_list periodic_timer; /* collect stale entries */ + struct ip_vs_service *svc; /* pointer back to service */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ + bool dead; +}; + + +/* + * IPVS LBLC sysctl table + */ +#ifdef CONFIG_SYSCTL +static struct ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static void ip_vs_lblc_rcu_free(struct rcu_head *head) +{ + struct ip_vs_lblc_entry *en = container_of(head, + struct ip_vs_lblc_entry, + rcu_head); + + ip_vs_dest_put_and_free(en->dest); + kfree(en); +} + +static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) +{ + hlist_del_rcu(&en->list); + call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); +} + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned int +ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); + + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* Get ip_vs_lblc_entry associated with supplied parameters. */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned int hash = ip_vs_lblc_hashkey(af, addr); + struct ip_vs_lblc_entry *en; + + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under spin lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, + u16 af, struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(af, tbl, daddr); + if (en) { + if (en->dest == dest) + return en; + ip_vs_lblc_del(en); + } + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; + + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); + en->lastuse = jiffies; + + ip_vs_dest_hold(dest); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; + int i; + + spin_lock_bh(&svc->sched_lock); + tbl->dead = true; + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblc_del(en); + atomic_dec(&tbl->entries); + } + } + spin_unlock_bh(&svc->sched_lock); +} + +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + return svc->ipvs->sysctl_lblc_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; + unsigned long now = jiffies; + int i, j; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + + sysctl_lblc_expiration(svc))) + continue; + + ip_vs_lblc_del(en); + atomic_dec(&tbl->entries); + } + spin_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblc_check_expire(struct timer_list *t) +{ + struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_service *svc = tbl->svc; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en; + struct hlist_node *next; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_del(en); + atomic_dec(&tbl->entries); + goal--; + } + spin_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + tbl->dead = false; + tbl->svc = svc; + atomic_set(&tbl->entries, 0); + + /* + * Hook periodic timer for garbage collection + */ + timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(svc); + + /* release the table itself */ + kfree_rcu(tbl, rcu_head); + IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n", + sizeof(*tbl)); +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLC: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + refcount_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry_rcu(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + dest = en->dest; + if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && + atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + } + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); + spin_unlock_bh(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->daddr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = { + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + +/* + * per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblc_ctl_table[0].procname = NULL; + + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblc_ops); + return ret; +} + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); + rcu_barrier(); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 000000000..77c323c36 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_set_elem { + struct list_head list; /* list link */ + struct ip_vs_dest *dest; /* destination server */ + struct rcu_head rcu_head; +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct list_head list; /* destination list */ +}; + + +static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, + struct ip_vs_dest *dest, bool check) +{ + struct ip_vs_dest_set_elem *e; + + if (check) { + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + return; + } + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) + return; + + ip_vs_dest_hold(dest); + e->dest = dest; + + list_add_rcu(&e->list, &set->list); + atomic_inc(&set->size); + + set->lastmod = jiffies; +} + +static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest_set_elem *e; + + e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); + ip_vs_dest_put_and_free(e->dest); + kfree(e); +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_set_elem *e; + + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) { + /* HIT */ + atomic_dec(&set->size); + set->lastmod = jiffies; + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); + break; + } + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_set_elem *e, *ep; + + list_for_each_entry_safe(e, ep, &set->list, list) { + list_del_rcu(&e->list); + call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); + } +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry_rcu(e, &set->list, list) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + list_for_each_entry_continue_rcu(e, &set->list, list) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if (((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + refcount_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry(e, &set->list, list) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = ip_vs_dest_conn_overhead(most); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + list_for_each_entry_continue(e, &set->list, list) { + dest = e->dest; + doh = ip_vs_dest_conn_overhead(dest); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if (((__s64)moh * atomic_read(&dest->weight) < + (__s64)doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + refcount_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct hlist_node list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ + struct rcu_head rcu_head; +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct rcu_head rcu_head; + struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + struct ip_vs_service *svc; /* pointer back to service */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ + bool dead; +}; + + +#ifdef CONFIG_SYSCTL +/* + * IPVS LBLCR sysctl table + */ + +static struct ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + hlist_del_rcu(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree_rcu(en, rcu_head); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned int +ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + + hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* Get ip_vs_lblcr_entry associated with supplied parameters. */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned int hash = ip_vs_lblcr_hashkey(af, addr); + struct ip_vs_lblcr_entry *en; + + hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under spin lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, + u16 af, struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(af, tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; + + en->af = af; + ip_vs_addr_copy(af, &en->addr, daddr); + en->lastuse = jiffies; + + /* initialize its dest set */ + atomic_set(&(en->set.size), 0); + INIT_LIST_HEAD(&en->set.list); + + ip_vs_dest_set_insert(&en->set, dest, false); + + ip_vs_lblcr_hash(tbl, en); + return en; + } + + ip_vs_dest_set_insert(&en->set, dest, true); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + int i; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; + + spin_lock_bh(&svc->sched_lock); + tbl->dead = true; + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { + ip_vs_lblcr_free(en); + } + } + spin_unlock_bh(&svc->sched_lock); +} + +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + return svc->ipvs->sysctl_lblcr_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_after(en->lastuse + + sysctl_lblcr_expiration(svc), now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + spin_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(struct timer_list *t) +{ + struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_service *svc = tbl->svc; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en; + struct hlist_node *next; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + spin_lock(&svc->sched_lock); + hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + spin_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_HLIST_HEAD(&tbl->bucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + tbl->dead = false; + tbl->svc = svc; + atomic_set(&tbl->entries, 0); + + /* + * Hook periodic timer for garbage collection + */ + timer_setup(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 0); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(svc); + + /* release the table itself */ + kfree_rcu(tbl, rcu_head); + IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n", + sizeof(*tbl)); +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + refcount_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry_rcu(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct ip_vs_dest *dest; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); + if (en) { + en->lastuse = jiffies; + + /* Get the least loaded destination */ + dest = ip_vs_dest_set_min(&en->set); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_lblcr_expiration(svc))) { + spin_lock_bh(&svc->sched_lock); + if (atomic_read(&en->set.size) > 1) { + struct ip_vs_dest *m; + + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + spin_unlock_bh(&svc->sched_lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) + goto out; + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* Update our cache entry */ + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_dest_set_insert(&en->set, dest, true); + spin_unlock_bh(&svc->sched_lock); + goto out; + } + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + spin_lock_bh(&svc->sched_lock); + if (!tbl->dead) + ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest); + spin_unlock_bh(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph->daddr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + +/* + * per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + return -ENOMEM; + + /* Don't export sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) + ipvs->lblcr_ctl_table[0].procname = NULL; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblcr_ops); + return ret; +} + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); + rcu_barrier(); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 000000000..9d34d81fc --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_dest_conn_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (!least) + ip_vs_scheduler_err(svc, "no destination available"); + else + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " + "inactconns %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c new file mode 100644 index 000000000..e3d7f5c87 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0 +/* IPVS: Maglev Hashing scheduling module + * + * Authors: Inju Song + * + */ + +/* The mh algorithm is to assign a preference list of all the lookup + * table positions to each destination and populate the table with + * the most-preferred position of destinations. Then it is to select + * destination with the hash key of source IP address through looking + * up a the lookup table. + * + * The algorithm is detailed in: + * [3.4 Consistent Hasing] +https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ +#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ + +struct ip_vs_mh_lookup { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +struct ip_vs_mh_dest_setup { + unsigned int offset; /* starting offset */ + unsigned int skip; /* skip */ + unsigned int perm; /* next_offset */ + int turns; /* weight / gcd() and rshift */ +}; + +/* Available prime numbers for MH table */ +static int primes[] = {251, 509, 1021, 2039, 4093, + 8191, 16381, 32749, 65521, 131071}; + +/* For IPVS MH entry hash table */ +#ifndef CONFIG_IP_VS_MH_TAB_INDEX +#define CONFIG_IP_VS_MH_TAB_INDEX 12 +#endif +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] + +struct ip_vs_mh_state { + struct rcu_head rcu_head; + struct ip_vs_mh_lookup *lookup; + struct ip_vs_mh_dest_setup *dest_setup; + hsiphash_key_t hash1, hash2; + int gcd; + int rshift; +}; + +static inline void generate_hash_secret(hsiphash_key_t *hash1, + hsiphash_key_t *hash2) +{ + hash1->key[0] = 2654435761UL; + hash1->key[1] = 2654435761UL; + + hash2->key[0] = 2654446892UL; + hash2->key[1] = 2654446892UL; +} + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} + +/* Returns hash value for IPVS MH entry */ +static inline unsigned int +ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, hsiphash_key_t *key, unsigned int offset) +{ + unsigned int v; + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ + addr->ip6[2] ^ addr->ip6[3]; +#endif + v = (offset + ntohs(port) + ntohl(addr_fold)); + return hsiphash(&v, sizeof(v), key); +} + +/* Reset all the hash buckets of the specified table. */ +static void ip_vs_mh_reset(struct ip_vs_mh_state *s) +{ + int i; + struct ip_vs_mh_lookup *l; + struct ip_vs_dest *dest; + + l = &s->lookup[0]; + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(l->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(l->dest, NULL); + } + l++; + } +} + +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest; + int lw; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * permutation for the dests. + */ + if (s->gcd < 1) + return 0; + + /* Set dest_setup for the dests permutation */ + p = &svc->destinations; + ds = &s->dest_setup[0]; + while ((p = p->next) != &svc->destinations) { + dest = list_entry(p, struct ip_vs_dest, n_list); + + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash1, 0) % + IP_VS_MH_TAB_SIZE; + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash2, 0) % + (IP_VS_MH_TAB_SIZE - 1) + 1; + ds->perm = ds->offset; + + lw = atomic_read(&dest->last_weight); + ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); + ds++; + } + + return 0; +} + +static int ip_vs_mh_populate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int n, c, dt_count; + unsigned long *table; + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest, *new_dest; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * the population for the dests and reset lookup table. + */ + if (s->gcd < 1) { + ip_vs_mh_reset(s); + return 0; + } + + table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL); + if (!table) + return -ENOMEM; + + p = &svc->destinations; + n = 0; + dt_count = 0; + while (n < IP_VS_MH_TAB_SIZE) { + if (p == &svc->destinations) + p = p->next; + + ds = &s->dest_setup[0]; + while (p != &svc->destinations) { + /* Ignore added server with zero weight */ + if (ds->turns < 1) { + p = p->next; + ds++; + continue; + } + + c = ds->perm; + while (test_bit(c, table)) { + /* Add skip, mod IP_VS_MH_TAB_SIZE */ + ds->perm += ds->skip; + if (ds->perm >= IP_VS_MH_TAB_SIZE) + ds->perm -= IP_VS_MH_TAB_SIZE; + c = ds->perm; + } + + __set_bit(c, table); + + dest = rcu_dereference_protected(s->lookup[c].dest, 1); + new_dest = list_entry(p, struct ip_vs_dest, n_list); + if (dest != new_dest) { + if (dest) + ip_vs_dest_put(dest); + ip_vs_dest_hold(new_dest); + RCU_INIT_POINTER(s->lookup[c].dest, new_dest); + } + + if (++n == IP_VS_MH_TAB_SIZE) + goto out; + + if (++dt_count >= ds->turns) { + dt_count = 0; + p = p->next; + ds++; + } + } + } + +out: + bitmap_free(table); + return 0; +} + +/* Get ip_vs_dest associated with supplied parameters. */ +static inline struct ip_vs_dest * +ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) + % IP_VS_MH_TAB_SIZE; + struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; +} + +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */ +static inline struct ip_vs_dest * +ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* First try the dest it's supposed to go to */ + ihash = ip_vs_mh_hashkey(svc->af, addr, port, + &s->hash1, 0) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + /* If the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; + hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, + roffset) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, + "MH: selected unavailable server %s:%u (offset %u), reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + +/* Assign all the hash buckets of the specified table with the service. */ +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int ret; + + if (svc->num_dests > IP_VS_MH_TAB_SIZE) + return -EINVAL; + + if (svc->num_dests >= 1) { + s->dest_setup = kcalloc(svc->num_dests, + sizeof(struct ip_vs_mh_dest_setup), + GFP_KERNEL); + if (!s->dest_setup) + return -ENOMEM; + } + + ip_vs_mh_permutate(s, svc); + + ret = ip_vs_mh_populate(s, svc); + if (ret < 0) + goto out; + + IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n", + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + +out: + if (svc->num_dests >= 1) { + kfree(s->dest_setup); + s->dest_setup = NULL; + } + return ret; +} + +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->last_weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g; +} + +/* To avoid assigning huge weight for the MH table, + * calculate shift value with gcd. + */ +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + int mw, shift; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, return + * shift value as zero. + */ + if (gcd < 1) + return 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->last_weight); + if (new_weight > weight) + weight = new_weight; + } + + /* Because gcd is greater than zero, + * the maximum weight and gcd are always greater than zero + */ + mw = weight / gcd; + + /* shift = occupied bits of weight/gcd - MH highest bits */ + shift = fls(mw) - IP_VS_MH_TAB_BITS; + return (shift >= 0) ? shift : 0; +} + +static void ip_vs_mh_state_free(struct rcu_head *head) +{ + struct ip_vs_mh_state *s; + + s = container_of(head, struct ip_vs_mh_state, rcu_head); + kfree(s->lookup); + kfree(s); +} + +static int ip_vs_mh_init_svc(struct ip_vs_service *svc) +{ + int ret; + struct ip_vs_mh_state *s; + + /* Allocate the MH table for this service */ + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), + GFP_KERNEL); + if (!s->lookup) { + kfree(s); + return -ENOMEM; + } + + generate_hash_secret(&s->hash1, &s->hash2); + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + IP_VS_DBG(6, + "MH lookup table (memory=%zdbytes) allocated for current service\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); + + /* Assign the lookup table with current dests */ + ret = ip_vs_mh_reassign(s, svc); + if (ret < 0) { + ip_vs_mh_reset(s); + ip_vs_mh_state_free(&s->rcu_head); + return ret; + } + + /* No more failures, attach state */ + svc->sched_data = s; + return 0; +} + +static void ip_vs_mh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + /* Got to clean up lookup entry here */ + ip_vs_mh_reset(s); + + call_rcu(&s->rcu_head, ip_vs_mh_state_free); + IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); +} + +static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + /* Assign the lookup table with the updated service */ + return ip_vs_mh_reassign(s, svc); +} + +/* Helper function to get port number */ +static inline __be16 +ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + +/* Maglev Hashing scheduling */ +static struct ip_vs_dest * +ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_mh_state *s; + __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; + + IP_VS_DBG(6, "%s : Scheduling...\n", __func__); + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT) + port = ip_vs_mh_get_port(skb, iph); + + s = (struct ip_vs_mh_state *)svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) + dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port); + else + dest = ip_vs_mh_get(svc, s, hash_addr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n", + IP_VS_DBG_ADDR(svc->af, hash_addr), + ntohs(port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + +/* IPVS MH Scheduler structure */ +static struct ip_vs_scheduler ip_vs_mh_scheduler = { + .name = "mh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list), + .init_service = ip_vs_mh_init_svc, + .done_service = ip_vs_mh_done_svc, + .add_dest = ip_vs_mh_dest_changed, + .del_dest = ip_vs_mh_dest_changed, + .upd_dest = ip_vs_mh_dest_changed, + .schedule = ip_vs_mh_schedule, +}; + +static int __init ip_vs_mh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_mh_scheduler); +} + +static void __exit ip_vs_mh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_mh_scheduler); + rcu_barrier(); +} + +module_init(ip_vs_mh_init); +module_exit(ip_vs_mh_cleanup); +MODULE_DESCRIPTION("Maglev hashing ipvs scheduler"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Inju Song "); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 000000000..08adcb222 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * Authors: + * Ben North + * Julian Anastasov Reorganize and sync with latest kernels + * Hannes Eder Extend NFCT support for FTP, ipvs match + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define FMT_TUPLE "%s:%u->%s:%u/%u" +#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \ + ntohs((T)->src.u.all), \ + IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \ + ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u" +#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \ + ntohs((C)->cport), \ + IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \ + ntohs((C)->vport), \ + IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \ + ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Never alter conntrack for OPS conns (no reply is expected) */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* Applications may adjust TCP seqs */ + if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && + !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, new reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&new_tuple)); + nf_conntrack_alter_reply(ct, &new_tuple); + IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n", + __func__, ct, ARG_CONN(cp)); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. + */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(&p); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + goto alter; + } + + IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? &cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp, 0); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n", + __func__, ARG_CONN(cp)); + + h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_kill(ct)) { + IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } else { + IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 000000000..f56862a87 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang + * + * Changes: + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + + +static inline int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *least = NULL; + int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + refcount_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c new file mode 100644 index 000000000..c03066fdd --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Overflow-Connection Scheduling module + * + * Authors: Raducu Deaconu + * + * Scheduler implements "overflow" loadbalancing according to number of active + * connections , will keep all connections to the node with the highest weight + * and overflow to the next node if the number of connections exceeds the node's + * weight. + * Note that this scheduler might not be suitable for UDP because it only uses + * active connections + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* OVF Connection scheduling */ +static struct ip_vs_dest * +ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *h = NULL; + int hw = 0, w; + + IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); + /* select the node with highest weight, go to next in line if active + * connections exceed weight + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + w = atomic_read(&dest->weight); + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->activeconns) > w || + w == 0) + continue; + if (!h || w > hw) { + h = dest; + hw = w; + } + } + + if (h) { + IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", + IP_VS_DBG_ADDR(h->af, &h->addr), + ntohs(h->port), + atomic_read(&h->activeconns), + atomic_read(&h->weight)); + return h; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_ovf_scheduler = { + .name = "ovf", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), + .schedule = ip_vs_ovf_schedule, +}; + +static int __init ip_vs_ovf_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); +} + +static void __exit ip_vs_ovf_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_ovf_init); +module_exit(ip_vs_ovf_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 000000000..166c669f0 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* semaphore for IPVS PEs. */ +static DEFINE_MUTEX(ip_vs_pe_mutex); + +/* Get pe in the pe list by name */ +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + rcu_read_lock(); + list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if (pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + rcu_read_unlock(); + return pe; + } + module_put(pe->module); + } + rcu_read_unlock(); + + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe by name */ + pe = __ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = __ip_vs_pe_getbyname(name); + } + + return pe; +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOENT; + + mutex_lock(&ip_vs_pe_mutex); + /* Make sure that the pe with this name doesn't exist + * in the pe list. + */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + mutex_unlock(&ip_vs_pe_mutex); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add_rcu(&pe->n_list, &ip_vs_pe); + mutex_unlock(&ip_vs_pe_mutex); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + mutex_lock(&ip_vs_pe_mutex); + /* Remove it from the d-linked pe list */ + list_del_rcu(&pe->n_list); + mutex_unlock(&ip_vs_pe_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c new file mode 100644 index 000000000..0ac6705a6 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include +#include +#include + +#ifdef CONFIG_IP_VS_DEBUG +static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, + const char *callid, size_t callid_len, + int *idx) +{ + size_t max_len = 64; + size_t len = min3(max_len, callid_len, buf_len - *idx - 1); + memcpy(buf + *idx, callid, len); + buf[*idx+len] = '\0'; + *idx += len + 1; + return buf + *idx - len; +} + +#define IP_VS_DEBUG_CALLID(callid, len) \ + ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ + callid, len, &ip_vs_dbg_idx) +#endif + +static int get_callid(const char *dptr, unsigned int dataoff, + unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen) +{ + /* Find callid */ + while (1) { + int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen, + SIP_HDR_CALL_ID, matchoff, + matchlen); + if (ret > 0) + break; + if (!ret) + return -EINVAL; + dataoff += *matchoff; + } + + /* Too large is useless */ + if (*matchlen > IP_VS_PEDATA_MAXLEN) + return -EINVAL; + + /* SIP headers are always followed by a line terminator */ + if (*matchoff + *matchlen == datalen) + return -EINVAL; + + /* RFC 2543 allows lines to be terminated with CR, LF or CRLF, + * RFC 3261 allows only CRLF, we support both. */ + if (*(dptr + *matchoff + *matchlen) != '\r' && + *(dptr + *matchoff + *matchlen) != '\n') + return -EINVAL; + + IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n", + IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen), + *matchlen); + return 0; +} + +static int +ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) +{ + struct ip_vs_iphdr iph; + unsigned int dataoff, datalen, matchoff, matchlen; + const char *dptr; + int retc; + + retc = ip_vs_fill_iph_skb(p->af, skb, false, &iph); + + /* Only useful with UDP */ + if (!retc || iph.protocol != IPPROTO_UDP) + return -EINVAL; + /* todo: IPv6 fragments: + * I think this only should be done for the first fragment. /HS + */ + dataoff = iph.len + sizeof(struct udphdr); + + if (dataoff >= skb->len) + return -EINVAL; + retc = skb_linearize(skb); + if (retc < 0) + return retc; + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + + if (get_callid(dptr, 0, datalen, &matchoff, &matchlen)) + return -EINVAL; + + /* N.B: pe_data is only set on success, + * this allows fallback to the default persistence logic on failure + */ + p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + + p->pe_data_len = matchlen; + + return 0; +} + +static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct) + +{ + bool ret = false; + + if (ct->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && + /* protocol should only be IPPROTO_IP if + * d_addr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + p->vaddr, &ct->vaddr) && + ct->vport == p->vport && + ct->flags & IP_VS_CONN_F_TEMPLATE && + ct->protocol == p->protocol && + ct->pe_data && ct->pe_data_len == p->pe_data_len && + !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) + ret = true; + + IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, + u32 initval, bool inverse) +{ + return jhash(p->pe_data, p->pe_data_len, initval); +} + +static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) +{ + memcpy(buf, cp->pe_data, cp->pe_data_len); + return cp->pe_data_len; +} + +static struct ip_vs_conn * +ip_vs_sip_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + if (likely(iph->protocol == IPPROTO_UDP)) + return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport); + /* currently no need to handle other than UDP */ + return NULL; +} + +static struct ip_vs_pe ip_vs_sip_pe = +{ + .name = "sip", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list), + .fill_param = ip_vs_sip_fill_param, + .ct_match = ip_vs_sip_ct_match, + .hashkey_raw = ip_vs_sip_hashkey_raw, + .show_pe_data = ip_vs_sip_show_pe_data, + .conn_out = ip_vs_sip_conn_out, +}; + +static int __init ip_vs_sip_init(void) +{ + return register_ip_vs_pe(&ip_vs_sip_pe); +} + +static void __exit ip_vs_sip_cleanup(void) +{ + unregister_ip_vs_pe(&ip_vs_sip_pe); + synchronize_rcu(); +} + +module_init(ip_vs_sip_init); +module_exit(ip_vs_sip_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 000000000..f100da4ba --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * Changes: + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + +/* States for conn templates: NONE or words separated with ",", max 15 chars */ +static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { + [IP_VS_CTPL_S_NONE] = "NONE", + [IP_VS_CTPL_S_ASSURED] = "ASSURED", +}; + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + +/* + * register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) +{ + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); + + if (!pd) + return -ENOMEM; + + pd->pp = pp; /* For speed issues */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) { + int ret = pp->init_netns(ipvs, pd); + if (ret) { + /* unlink an free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } + + return 0; +} + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + +/* + * unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + struct ip_vs_proto_data **pd_p; + unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(ipvs, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned int hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} +EXPORT_SYMBOL(ip_vs_proto_get); + +/* + * get ip_vs_protocol object data by netns and proto + */ +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ + struct ip_vs_proto_data *pd; + unsigned int hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} +EXPORT_SYMBOL(ip_vs_proto_data_get); + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) +{ + struct ip_vs_proto_data *pd; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_KERNEL); +} + + +const char *ip_vs_state_name(const struct ip_vs_conn *cp) +{ + unsigned int state = cp->state; + struct ip_vs_protocol *pp; + + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + + if (state >= IP_VS_CTPL_S_LAST) + return "ERR!"; + return ip_vs_ctpl_state_name_table[state] ? : "?"; + } + pp = ip_vs_proto_get(cp->protocol); + if (pp == NULL || pp->state_name == NULL) + return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI4->%pI4", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI4:%u->%pI4:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI6c->%pI6c", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI6c:%u->%pI6c:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + +/* + * per network name-space init + */ +int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) +{ + int i, ret; + static struct ip_vs_protocol *protos[] = { +#ifdef CONFIG_IP_VS_PROTO_TCP + &ip_vs_protocol_tcp, +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + &ip_vs_protocol_udp, +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + &ip_vs_protocol_sctp, +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + &ip_vs_protocol_ah, +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + &ip_vs_protocol_esp, +#endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(ipvs, protos[i]); + if (ret < 0) + goto cleanup; + } + return 0; + +cleanup: + ip_vs_protocol_net_cleanup(ipvs); + return ret; +} + +void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) +{ + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(ipvs, pd); + } +} + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + REGISTER_PROTOCOL(&ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + pr_info("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 000000000..89602c16f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + +static void +ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, + const struct ip_vs_iphdr *iph, + struct ip_vs_conn_param *p) +{ + if (likely(!ip_vs_iph_inverse(iph))) + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} + +static struct ip_vs_conn * +ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); + cp = ip_vs_conn_in_get(&p); + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + + ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); + cp = ip_vs_conn_out_get(&p); + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + ip_vs_iph_icmp(iph) ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c new file mode 100644 index 000000000..a0921adc3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -0,0 +1,595 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); + +static int +sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_service *svc; + struct sctp_chunkhdr _schunkh, *sch; + struct sctphdr *sh, _sctph; + __be16 _ports[2], *ports = NULL; + + if (likely(!ip_vs_iph_icmp(iph))) { + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + if (sh) { + sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), + sizeof(_schunkh), &_schunkh); + if (sch) { + if (sch->type == SCTP_CID_ABORT || + !(sysctl_sloppy_sctp(ipvs) || + sch->type == SCTP_CID_INIT)) + return 1; + ports = &sh->source; + } + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { + *verdict = NF_DROP; + return 0; + } + + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + if (svc) { + int ignored; + + if (ip_vs_todrop(ipvs)) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + *verdict = NF_DROP; + return 0; + } + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + return 0; + } + } + /* NF_ACCEPT */ + return 1; +} + +static void sctp_nat_csum(struct sk_buff *skb, struct sctphdr *sctph, + unsigned int sctphoff) +{ + sctph->checksum = sctp_compute_cksum(skb, sctphoff); + skb->ip_summed = CHECKSUM_UNNECESSARY; +} + +static int +sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct sctphdr *sctph; + unsigned int sctphoff = iph->len; + bool payload_csum = false; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + + /* csum_check requires unshared skb */ + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (!sctp_csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + ret = ip_vs_app_pkt_out(cp, skb, iph); + if (ret == 0) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + + /* Only update csum if we really have to */ + if (sctph->source != cp->vport || payload_csum || + skb->ip_summed == CHECKSUM_PARTIAL) { + sctph->source = cp->vport; + sctp_nat_csum(skb, sctph, sctphoff); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + return 1; +} + +static int +sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct sctphdr *sctph; + unsigned int sctphoff = iph->len; + bool payload_csum = false; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + + /* csum_check requires unshared skb */ + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (!sctp_csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + ret = ip_vs_app_pkt_in(cp, skb, iph); + if (ret == 0) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 2) + payload_csum = true; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + + /* Only update csum if we really have to */ + if (sctph->dest != cp->dport || payload_csum || + (skb->ip_summed == CHECKSUM_PARTIAL && + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { + sctph->dest = cp->dport; + sctp_nat_csum(skb, sctph, sctphoff); + } else if (skb->ip_summed != CHECKSUM_PARTIAL) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + return 1; +} + +static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int sctphoff; + struct sctphdr *sh; + __le32 cmp, val; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + sh = (struct sctphdr *)(skb->data + sctphoff); + cmp = sh->checksum; + val = sctp_compute_cksum(skb, sctphoff); + + if (val != cmp) { + /* CRC failure, dump it. */ + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + return 1; +} + +enum ipvs_sctp_event_t { + IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */ + IP_VS_SCTP_INIT, + IP_VS_SCTP_INIT_ACK, + IP_VS_SCTP_COOKIE_ECHO, + IP_VS_SCTP_COOKIE_ACK, + IP_VS_SCTP_SHUTDOWN, + IP_VS_SCTP_SHUTDOWN_ACK, + IP_VS_SCTP_SHUTDOWN_COMPLETE, + IP_VS_SCTP_ERROR, + IP_VS_SCTP_ABORT, + IP_VS_SCTP_EVENT_LAST +}; + +/* RFC 2960, 3.2 Chunk Field Descriptions */ +static __u8 sctp_events[] = { + [SCTP_CID_DATA] = IP_VS_SCTP_DATA, + [SCTP_CID_INIT] = IP_VS_SCTP_INIT, + [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK, + [SCTP_CID_SACK] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA, + [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA, + [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT, + [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN, + [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK, + [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR, + [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO, + [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK, + [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA, + [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA, + [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE, +}; + +/* SCTP States: + * See RFC 2960, 4. SCTP Association State Diagram + * + * New states (not in diagram): + * - INIT1 state: use shorter timeout for dropped INIT packets + * - REJECTED state: use shorter timeout if INIT is rejected with ABORT + * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging + * + * The states are as seen in real server. In the diagram, INIT1, INIT, + * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state. + * + * States as per packets from client (C) and server (S): + * + * Setup of client connection: + * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK + * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK + * + * Setup of server connection: + * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK + * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO + * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK + */ + +#define sNO IP_VS_SCTP_S_NONE +#define sI1 IP_VS_SCTP_S_INIT1 +#define sIN IP_VS_SCTP_S_INIT +#define sCS IP_VS_SCTP_S_COOKIE_SENT +#define sCR IP_VS_SCTP_S_COOKIE_REPLIED +#define sCW IP_VS_SCTP_S_COOKIE_WAIT +#define sCO IP_VS_SCTP_S_COOKIE +#define sCE IP_VS_SCTP_S_COOKIE_ECHOED +#define sES IP_VS_SCTP_S_ESTABLISHED +#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT +#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED +#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT +#define sRJ IP_VS_SCTP_S_REJECTED +#define sCL IP_VS_SCTP_S_CLOSED + +static const __u8 sctp_states + [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = { + { /* INPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* OUTPUT */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW}, +/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL}, +/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, + { /* INPUT-ONLY */ +/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/ +/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN}, +/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL}, +/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL}, +/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL}, +/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL}, +/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL}, +/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + }, +}; + +#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ) + +/* Timeout table[state] */ +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ, + [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO, + [IP_VS_SCTP_S_LAST] = 2 * HZ, +}; + +static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT1] = "INIT1", + [IP_VS_SCTP_S_INIT] = "INIT", + [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT", + [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED", + [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT", + [IP_VS_SCTP_S_COOKIE] = "COOKIE", + [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT", + [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED", + [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT", + [IP_VS_SCTP_S_REJECTED] = "REJECTED", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + [IP_VS_SCTP_S_LAST] = "BUG!", +}; + + +static const char *sctp_state_name(int state) +{ + if (state >= IP_VS_SCTP_S_LAST) + return "ERR!"; + if (sctp_state_name_table[state]) + return sctp_state_name_table[state]; + return "?"; +} + +static inline void +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, const struct sk_buff *skb) +{ + struct sctp_chunkhdr _sctpch, *sch; + unsigned char chunk_type; + int event, next_state; + int ihl, cofs; + +#ifdef CONFIG_IP_VS_IPV6 + ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + ihl = ip_hdrlen(skb); +#endif + + cofs = ihl + sizeof(struct sctphdr); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); + if (sch == NULL) + return; + + chunk_type = sch->type; + /* + * Section 3: Multiple chunks can be bundled into one SCTP packet + * up to the MTU size, except for the INIT, INIT ACK, and + * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with + * any other chunk in a packet. + * + * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control + * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be + * bundled with an ABORT, but they MUST be placed before the ABORT + * in the SCTP packet or they will be ignored by the receiver. + */ + if ((sch->type == SCTP_CID_COOKIE_ECHO) || + (sch->type == SCTP_CID_COOKIE_ACK)) { + int clen = ntohs(sch->length); + + if (clen >= sizeof(_sctpch)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) + chunk_type = sch->type; + } + } + + event = (chunk_type < sizeof(sctp_events)) ? + sctp_events[chunk_type] : IP_VS_SCTP_DATA; + + /* Update direction to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (direction == IP_VS_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + direction = IP_VS_DIR_INPUT_ONLY; + } + + next_state = sctp_states[direction][event][cp->state]; + + if (next_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((direction == IP_VS_DIR_OUTPUT) ? + "output " : "input "), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + sctp_state_name(cp->state), + sctp_state_name(next_state), + refcount_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state == IP_VS_SCTP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + if (next_state == IP_VS_SCTP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); + } + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? */ + cp->timeout = sctp_timeouts[cp->state = next_state]; +} + +static void +sctp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, struct ip_vs_proto_data *pd) +{ + spin_lock_bh(&cp->lock); + set_sctp_state(pd, cp, direction, skb); + spin_unlock_bh(&cp->lock); +} + +static inline __u16 sctp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) + & SCTP_APP_TAB_MASK; +} + +static int sctp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); + + hash = sctp_app_hashkey(port); + + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->appcnt); +out: + + return ret; +} + +static void sctp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_SCTP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); +} + +static int sctp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = cp->ipvs; + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + /* Lookup application incarnations and bind the right one */ + hash = sctp_app_hashkey(cp->vport); + + list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + break; + } + } + + return result; +} + +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static int __ip_vs_sctp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; +} + +static void __ip_vs_sctp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + +struct ip_vs_protocol ip_vs_protocol_sctp = { + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, + .unregister_app = sctp_unregister_app, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .state_name = sctp_state_name, + .state_transition = sctp_state_transition, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 000000000..7da51390c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,746 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * Changes: Hans Schillstrom + * + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include +#include +#include + +#include + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); + +static int +tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + __be16 _ports[2], *ports = NULL; + + /* In the event of icmp, we're only guaranteed to have the first 8 + * bytes of the transport header, so we only check the rest of the + * TCP packet for non-ICMP packets + */ + if (likely(!ip_vs_iph_icmp(iph))) { + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th) { + if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) + return 1; + ports = &th->source; + } + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { + *verdict = NF_DROP; + return 0; + } + + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + + if (svc) { + int ignored; + + if (ip_vs_todrop(ipvs)) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + return 0; + } + } + /* NF_ACCEPT */ + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); + else +#endif + tcph->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); +} + + +INDIRECT_CALLABLE_SCOPE int +tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct tcphdr *tcph; + unsigned int tcphoff = iph->len; + bool payload_csum = false; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (!tcp_csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = true; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = cp->app ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct tcphdr *tcph; + unsigned int tcphoff = iph->len; + bool payload_csum = false; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (!tcp_csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = true; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = cp->app ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + fallthrough; + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { + [IP_VS_TCP_S_NONE] = false, + [IP_VS_TCP_S_ESTABLISHED] = true, + [IP_VS_TCP_S_SYN_SENT] = true, + [IP_VS_TCP_S_SYN_RECV] = true, + [IP_VS_TCP_S_FIN_WAIT] = false, + [IP_VS_TCP_S_TIME_WAIT] = false, + [IP_VS_TCP_S_CLOSE] = false, + [IP_VS_TCP_S_CLOSE_WAIT] = false, + [IP_VS_TCP_S_LAST_ACK] = false, + [IP_VS_TCP_S_LISTEN] = false, + [IP_VS_TCP_S_SYNACK] = true, +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static bool tcp_state_active(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return false; + return tcp_state_active_table[state]; +} + +static struct tcp_states_t tcp_states[] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos[] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " + "d:%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), + ntohs(cp->dport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + refcount_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + !tcp_state_active(new_state)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + tcp_state_active(new_state)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + if (new_state == IP_VS_TCP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); + } + + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; +} + +/* + * Handle state transitions + */ +static void +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); + if (th == NULL) + return; + + spin_lock_bh(&cp->lock); + set_tcp_state(pd, cp, direction, th); + spin_unlock_bh(&cp->lock); +} + +static inline __u16 tcp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) + & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); + + hash = tcp_app_hashkey(port); + + list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + return ret; +} + + +static void +tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = cp->ipvs; + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + break; + } + } + + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); + + spin_lock_bh(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] + : tcp_timeouts[IP_VS_TCP_S_LISTEN]); + spin_unlock_bh(&cp->lock); +} + +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + pd->tcp_state_table = tcp_states; + return 0; +} + +static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 000000000..68260d91c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,503 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * Changes: Hans Schillstrom + * Network name space (netns) aware. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); + +static int +udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, + struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + __be16 _ports[2], *ports = NULL; + + if (likely(!ip_vs_iph_icmp(iph))) { + /* IPv6 fragments, only first fragment will hit this */ + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + if (uh) + ports = &uh->source; + } else { + ports = skb_header_pointer( + skb, iph->len, sizeof(_ports), &_ports); + } + + if (!ports) { + *verdict = NF_DROP; + return 0; + } + + if (likely(!ip_vs_iph_inverse(iph))) + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->daddr, ports[1]); + else + svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, + &iph->saddr, ports[0]); + + if (svc) { + int ignored; + + if (ip_vs_todrop(ipvs)) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd, iph); + else + *verdict = NF_DROP; + return 0; + } + } + /* NF_ACCEPT */ + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); +} + + +INDIRECT_CALLABLE_SCOPE int +udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct udphdr *udph; + unsigned int udphoff = iph->len; + bool payload_csum = false; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (!udp_csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = true; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = cp->app ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) +{ + struct udphdr *udph; + unsigned int udphoff = iph->len; + bool payload_csum = false; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && iph->fragoffs) + return 1; +#endif + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (!udp_csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = true; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = cp->app ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + fallthrough; + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); + + hash = udp_app_hashkey(port); + + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + return ret; +} + + +static void +udp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); + + atomic_dec(&pd->appcnt); + list_del_rcu(&inc->p_list); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = cp->ipvs; + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + break; + } + } + + return result; +} + + +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static void +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + if (unlikely(!pd)) { + pr_err("UDP no ns data\n"); + return; + } + + cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; + if (direction == IP_VS_DIR_OUTPUT) + ip_vs_control_assure_ct(cp); +} + +static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); + pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, + sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; +} + +static void __udp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __udp_init, + .exit_netns = __udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c new file mode 100644 index 000000000..38495c6f6 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) +{ + struct list_head *p; + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + /* dest is already unlinked, so p->prev is not valid but + * p->next is valid, use it to reach previous entry. + */ + if (p == &dest->n_list) + svc->sched_data = p->next->prev; + spin_unlock_bh(&svc->sched_lock); + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct list_head *p; + struct ip_vs_dest *dest, *last; + int pass = 0; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + spin_lock_bh(&svc->sched_lock); + p = (struct list_head *) svc->sched_data; + last = dest = list_entry(p, struct ip_vs_dest, n_list); + + do { + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + if (dest == last) + goto stop; + } + pass++; + /* Previous dest could be unlinked, do not loop forever. + * If we stay at head there is no need for 2nd pass. + */ + } while (pass < 2 && p != &svc->destinations); + +stop: + spin_unlock_bh(&svc->sched_lock); + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + out: + svc->sched_data = &dest->n_list; + spin_unlock_bh(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + refcount_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), + .init_service = ip_vs_rr_init_svc, + .add_dest = NULL, + .del_dest = ip_vs_rr_del_dest, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 000000000..d4903723b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * + * Changes: + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(ip_vs_scheduler_err); +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* semaphore for schedulers */ +static DEFINE_MUTEX(ip_vs_sched_mutex); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + pr_err("%s(): init error\n", __func__); + return ret; + } + } + rcu_assign_pointer(svc->scheduler, scheduler); + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +void ip_vs_unbind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *sched) +{ + struct ip_vs_scheduler *cur_sched; + + cur_sched = rcu_dereference_protected(svc->scheduler, 1); + /* This check proves that old 'sched' was installed */ + if (!cur_sched) + return; + + if (sched->done_service) + sched->done_service(svc); + /* svc->scheduler can be set to NULL only by caller */ +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); + + mutex_lock(&ip_vs_sched_mutex); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + mutex_unlock(&ip_vs_sched_mutex); + return sched; + } + module_put(sched->module); + } + + mutex_unlock(&ip_vs_sched_mutex); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler) + module_put(scheduler->module); +} + +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; + + if (svc->fwmark) { + IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", + sched_name, svc->fwmark, svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 + } else if (svc->af == AF_INET6) { + IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", + sched_name, ip_vs_proto_name(svc->protocol), + &svc->addr.in6, ntohs(svc->port), msg); +#endif + } else { + IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", + sched_name, ip_vs_proto_name(svc->protocol), + &svc->addr.ip, ntohs(svc->port), msg); + } +} + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + if (!scheduler->name) { + pr_err("%s(): NULL scheduler_name\n", __func__); + return -EINVAL; + } + + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOENT; + + mutex_lock(&ip_vs_sched_mutex); + + if (!list_empty(&scheduler->n_list)) { + mutex_unlock(&ip_vs_sched_mutex); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] scheduler already linked\n", + __func__, scheduler->name); + return -EINVAL; + } + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + mutex_unlock(&ip_vs_sched_mutex); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] scheduler already existed " + "in the system\n", __func__, scheduler->name); + return -EINVAL; + } + } + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + mutex_unlock(&ip_vs_sched_mutex); + + pr_info("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + mutex_lock(&ip_vs_sched_mutex); + if (list_empty(&scheduler->n_list)) { + mutex_unlock(&ip_vs_sched_mutex); + pr_err("%s(): [%s] scheduler is not in the list. failed\n", + __func__, scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + mutex_unlock(&ip_vs_sched_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c new file mode 100644 index 000000000..7663288e5 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Authors: Wensong Zhang + * + * Changes: + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + + +static inline int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + refcount_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c new file mode 100644 index 000000000..c2028e412 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Source Hashing scheduling module + * + * Authors: Wensong Zhang + * + * Changes: + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + * The weight destination attribute can be used to control the + * distribution of connections to the destinations in servernode. The + * greater the weight, the more connections the destination + * will receive. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + +struct ip_vs_sh_state { + struct rcu_head rcu_head; + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; +}; + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned int +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, unsigned int offset) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (offset + hash_32(ntohs(port) + ntohl(addr_fold), + IP_VS_SH_TAB_BITS)) & + IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; +} + + +/* As ip_vs_sh_get, but with fallback if selected server is unavailable + * + * The fallback strategy loops around the table starting from a "random" + * point (in fact, it is chosen to be the original hash value to make the + * algorithm deterministic) to find a new server. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* first try the dest it's supposed to go to */ + ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + dest = rcu_dereference(s->buckets[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + /* if the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE; + hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset); + dest = rcu_dereference(s->buckets[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, "SH: selected unavailable " + "server %s:%d (offset %d), reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + int d_count; + bool empty; + + b = &s->buckets[0]; + p = &svc->destinations; + empty = list_empty(p); + d_count = 0; + for (i=0; idest, 1); + if (dest) + ip_vs_dest_put(dest); + if (empty) + RCU_INIT_POINTER(b->dest, NULL); + else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + ip_vs_dest_hold(dest); + RCU_INIT_POINTER(b->dest, dest); + + IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", + i, IP_VS_DBG_ADDR(dest->af, &dest->addr), + atomic_read(&dest->weight)); + + /* Don't move to next dest until filling weight */ + if (++d_count >= atomic_read(&dest->weight)) { + p = p->next; + d_count = 0; + } + + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_sh_flush(struct ip_vs_sh_state *s) +{ + int i; + struct ip_vs_sh_bucket *b; + struct ip_vs_dest *dest; + + b = &s->buckets[0]; + for (i=0; idest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(b->dest, NULL); + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_state *s; + + /* allocate the SH table for this service */ + s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + + svc->sched_data = s; + IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with current dests */ + ip_vs_sh_reassign(s, svc); + + return 0; +} + + +static void ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_state *s = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(s); + + /* release the table itself */ + kfree_rcu(s, rcu_head); + IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); +} + + +static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_sh_state *s = svc->sched_data; + + /* assign the hash buckets with the updated service */ + ip_vs_sh_reassign(s, svc); + + return 0; +} + + +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_state *s; + __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) + port = ip_vs_sh_get_port(skb, iph); + + s = (struct ip_vs_sh_state *) svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) + dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port); + else + dest = ip_vs_sh_get(svc, s, hash_addr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, hash_addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .add_dest = ip_vs_sh_dest_changed, + .del_dest = ip_vs_sh_dest_changed, + .upd_dest = ip_vs_sh_dest_changed, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); + synchronize_rcu(); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c new file mode 100644 index 000000000..e1dea9a82 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -0,0 +1,2051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version 1, is capable of handling both version 0 and 1 messages. + * Version 0 is the plain old format. + * Note Version 0 receivers will just drop Ver 1 messages. + * Version 1 is capable of handle IPv6, Persistence data, + * time-outs, and firewall marks. + * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions Message: is a complete datagram + * Sync_conn: is a part of a Message + * Param Data is an option to a Sync_conn. + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added Version 1: i.e. IPv6, + * Persistence support, fwmark and time-out. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_mc_join_group */ +#include +#include +#include +#include +#include +#include + +#include /* Used for ntoh_seq and hton_seq */ + +#include +#include + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + +#define SYNC_PROTO_VER 1 /* Protocol version in header */ + +static struct lock_class_key __ipvs_sync_key; +/* + * IPVS sync connection entry + * Version 0, i.e. original version. + */ +struct ip_vs_sync_conn_v0 { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + + /* Flags and state transition */ + __be16 flags; /* status flags */ + __be16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { + struct ip_vs_sync_v4 v4; + struct ip_vs_sync_v6 v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6 0 +#define STYPE_F_INET6 (1 << STYPE_INET6) + +#define SVER_SHIFT 12 /* Shift to get version */ +#define SVER_MASK 0x0fff /* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA 1 +#define IPVS_OPT_PE_DATA 2 +#define IPVS_OPT_PE_NAME 3 +#define IPVS_OPT_PARAM 7 + +#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) + +struct ip_vs_sync_thread_data { + struct task_struct *task; + struct netns_ipvs *ipvs; + struct socket *sock; + char *buf; + int id; +}; + +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages (Datagrams) to the backup load balancers + in the following format. + + Version 1: + Note, first byte should be Zero, so ver 0 receivers will drop the packet. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0 | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Version | Reserved, set to Zero | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + ~ . ~ + | . | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | IPVS Sync Connection (1) | +*/ + +/* Version 0 header */ +struct ip_vs_sync_mesg_v0 { + __u8 nr_conns; + __u8 syncid; + __be16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* Version 1 header */ +struct ip_vs_sync_mesg { + __u8 reserved; /* must be zero */ + __u8 syncid; + __be16 size; + __u8 nr_conns; + __s8 version; /* SYNC_PROTO_VER */ + __u16 spare; + /* ip_vs_sync_conn entries start here */ +}; + +union ipvs_sockaddr { + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ + memset(ho, 0, sizeof(*ho)); + ho->init_seq = get_unaligned_be32(&no->init_seq); + ho->delta = get_unaligned_be32(&no->delta); + ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} + +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ + put_unaligned_be32(ho->init_seq, &no->init_seq); + put_unaligned_be32(ho->delta, &no->delta); + put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} + +static inline struct ip_vs_sync_buff * +sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ms->sync_queue)) { + sb = NULL; + __set_current_state(TASK_INTERRUPTIBLE); + } else { + sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, + list); + list_del(&sb->list); + ms->sync_queue_len--; + if (!ms->sync_queue_len) + ms->sync_queue_delay = 0; + } + spin_unlock_bh(&ipvs->sync_lock); + + return sb; +} + +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ + sb->mesg->version = SYNC_PROTO_VER; + sb->mesg->syncid = ipvs->mcfg.syncid; + sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); + sb->mesg->nr_conns = 0; + sb->mesg->spare = 0; + sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); + sb->end = (unsigned char *)sb->mesg + len; + + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +static inline void sb_queue_tail(struct netns_ipvs *ipvs, + struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb = ms->sync_buff; + + spin_lock(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER && + ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { + if (!ms->sync_queue_len) + schedule_delayed_work(&ms->master_wakeup_work, + max(IPVS_SYNC_SEND_DELAY, 1)); + ms->sync_queue_len++; + list_add_tail(&sb->list, &ms->sync_queue); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { + int id = (int)(ms - ipvs->ms); + + wake_up_process(ipvs->master_tinfo[id].task); + } + } else + ip_vs_sync_buff_release(sb); + spin_unlock(&ipvs->sync_lock); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, + unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_buff_lock); + sb = ms->sync_buff; + if (sb && time_after_eq(jiffies - sb->firstuse, time)) { + ms->sync_buff = NULL; + __set_current_state(TASK_RUNNING); + } else + sb = NULL; + spin_unlock_bh(&ipvs->sync_buff_lock); + return sb; +} + +static inline int +select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) +{ + return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; +} + +/* + * Create a new sync buffer for Version 0 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) +{ + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg_v0 *mesg; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; + mesg->nr_conns = 0; + mesg->syncid = ipvs->mcfg.syncid; + mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); + sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); + sb->end = (unsigned char *)mesg + len; + sb->firstuse = jiffies; + return sb; +} + +/* Check if connection is controlled by persistence */ +static inline bool in_persistence(struct ip_vs_conn *cp) +{ + for (cp = cp->control; cp; cp = cp->control) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + return true; + } + return false; +} + +/* Check if conn should be synced. + * pkts: conn packets, use sysctl_sync_threshold to avoid packet check + * - (1) sync_refresh_period: reduce sync rate. Additionally, retry + * sync_retries times with period of sync_refresh_period/8 + * - (2) if both sync_refresh_period and sync_period are 0 send sync only + * for state changes or only once when pkts matches sync_threshold + * - (3) templates: rate can be reduced only with sync_refresh_period or + * with (2) + */ +static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, + struct ip_vs_conn *cp, int pkts) +{ + unsigned long orig = READ_ONCE(cp->sync_endtime); + unsigned long now = jiffies; + unsigned long n = (now + cp->timeout) & ~3UL; + unsigned int sync_refresh_period; + int sync_period; + int force; + + /* Check if we sync in current state */ + if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE)) + force = 0; + else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp))) + return 0; + else if (likely(cp->protocol == IPPROTO_TCP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_TCP_S_ESTABLISHED) | + (1 << IP_VS_TCP_S_FIN_WAIT) | + (1 << IP_VS_TCP_S_CLOSE) | + (1 << IP_VS_TCP_S_CLOSE_WAIT) | + (1 << IP_VS_TCP_S_TIME_WAIT)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_TCP_S_ESTABLISHED) + goto set; + } else if (unlikely(cp->protocol == IPPROTO_SCTP)) { + if (!((1 << cp->state) & + ((1 << IP_VS_SCTP_S_ESTABLISHED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) | + (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) | + (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) | + (1 << IP_VS_SCTP_S_CLOSED)))) + return 0; + force = cp->state != cp->old_state; + if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED) + goto set; + } else { + /* UDP or another protocol with single state */ + force = 0; + } + + sync_refresh_period = sysctl_sync_refresh_period(ipvs); + if (sync_refresh_period > 0) { + long diff = n - orig; + long min_diff = max(cp->timeout >> 1, 10UL * HZ); + + /* Avoid sync if difference is below sync_refresh_period + * and below the half timeout. + */ + if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) { + int retries = orig & 3; + + if (retries >= sysctl_sync_retries(ipvs)) + return 0; + if (time_before(now, orig - cp->timeout + + (sync_refresh_period >> 3))) + return 0; + n |= retries + 1; + } + } + sync_period = sysctl_sync_period(ipvs); + if (sync_period > 0) { + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && + pkts % sync_period != sysctl_sync_threshold(ipvs)) + return 0; + } else if (!sync_refresh_period && + pkts != sysctl_sync_threshold(ipvs)) + return 0; + +set: + cp->old_state = cp->state; + n = cmpxchg(&cp->sync_endtime, orig, n); + return n == orig || force; +} + +/* + * Version 0 , could be switched in by sys_ctl. + * Add an ip_vs_conn information into the current sync_buff. + */ +static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, + int pkts) +{ + struct ip_vs_sync_mesg_v0 *m; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; + unsigned int len; + + if (unlikely(cp->af != AF_INET)) + return; + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + return; + + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + buff = ms->sync_buff; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + if (buff) { + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + /* Send buffer if it is for v1 */ + if (buff->head + len > buff->end || !m->nr_conns) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + } + } + if (!buff) { + buff = ip_vs_sync_buff_create_v0(ipvs, len); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + ms->sync_buff = buff; + } + + m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; + s = (struct ip_vs_sync_conn_v0 *) buff->head; + + /* copy members */ + s->reserved = 0; + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->sync_conn_opt, sizeof(*opt)); + } + + m->nr_conns++; + m->size = htons(ntohs(m->size) + len); + buff->head += len; + spin_unlock_bh(&ipvs->sync_buff_lock); + + /* synchronize its controller if it has */ + cp = cp->control; + if (cp) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_inc_return(&cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + ip_vs_sync_conn(ipvs, cp, pkts); + } +} + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + * Sending Version 1 messages + */ +void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) +{ + struct ip_vs_sync_mesg *m; + union ip_vs_sync_conn *s; + struct ip_vs_sync_buff *buff; + struct ipvs_master_sync_state *ms; + int id; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Handle old version of the protocol */ + if (sysctl_sync_ver(ipvs) == 0) { + ip_vs_sync_conn_v0(ipvs, cp, pkts); + return; + } + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) + goto control; + + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } + + spin_lock_bh(&ipvs->sync_buff_lock); + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + spin_unlock_bh(&ipvs->sync_buff_lock); + return; + } + + id = select_master_thread_id(ipvs, cp); + ms = &ipvs->ms[id]; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + buff = ms->sync_buff; + if (buff) { + m = buff->mesg; + pad = (4 - (size_t) buff->head) & 3; + /* Send buffer if it is for v0 */ + if (buff->head + len + pad > buff->end || m->reserved) { + sb_queue_tail(ipvs, ms); + ms->sync_buff = NULL; + buff = NULL; + pad = 0; + } + } + + if (!buff) { + buff = ip_vs_sync_buff_create(ipvs, len); + if (!buff) { + spin_unlock_bh(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + ms->sync_buff = buff; + m = buff->mesg; + } + + p = buff->head; + buff->head += pad + len; + m->size = htons(ntohs(m->size) + pad + len); + /* Add ev. padding from prev. sync_conn */ + while (pad--) + *(p++) = 0; + + s = (union ip_vs_sync_conn *)p; + + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); + m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + s->v6.caddr = cp->caddr.in6; + s->v6.vaddr = cp->vaddr.in6; + s->v6.daddr = cp->daddr.in6; + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; + } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + + spin_unlock_bh(&ipvs->sync_buff_lock); + +control: + /* synchronize its controller if it has */ + cp = cp->control; + if (!cp) + return; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + pkts = atomic_inc_return(&cp->in_pkts); + else + pkts = sysctl_sync_threshold(ipvs); + goto sloop; +} + +/* + * fill_param used by version 1 + */ +static inline int +ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + module_put(p->pe->module); + return -ENOMEM; + } + p->pe_data_len = pe_data_len; + } + return 0; +} + +/* + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. + */ +static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, + unsigned int flags, unsigned int state, + unsigned int protocol, unsigned int type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp; + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + cp = ip_vs_conn_in_get(param); + if (cp && ((cp->dport != dport) || + !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { + if (!(flags & IP_VS_CONN_F_INACTIVE)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } else { + /* This is the expiration message for the + * connection that was already replaced, so we + * just ignore it. + */ + __ip_vs_conn_put(cp); + kfree(param->pe_data); + return; + } + } + } else { + cp = ip_vs_ct_in_get(param); + } + + if (cp) { + /* Free pe_data */ + kfree(param->pe_data); + + dest = cp->dest; + spin_lock_bh(&cp->lock); + if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && + !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { + if (flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + } else { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + } + } + flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; + flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; + cp->flags = flags; + spin_unlock_bh(&cp->lock); + if (!dest) + ip_vs_try_bind_dest(cp); + } else { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + rcu_read_lock(); + /* This function is only invoked by the synchronization + * code. We do not currently support heterogeneous pools + * with synchronization, so we can make the assumption that + * the svc_af is the same as the dest_af + */ + dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, + param->vaddr, param->vport, protocol, + fwmark, flags); + + cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, + fwmark); + rcu_read_unlock(); + if (!cp) { + kfree(param->pe_data); + IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); + return; + } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + kfree(param->pe_data); + } + + if (opt) { + cp->in_seq = opt->in_seq; + cp->out_seq = opt->out_seq; + } + atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); + cp->state = state; + cp->old_state = cp->state; + /* + * For Ver 0 messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * Ver 1 messages style. + * - No problem. + */ + if (timeout) { + if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) + timeout = MAX_SCHEDULE_TIMEOUT / HZ; + cp->timeout = timeout*HZ; + } else { + struct ip_vs_proto_data *pd; + + pd = ip_vs_proto_data_get(ipvs, protocol); + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) + cp->timeout = pd->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + } + ip_vs_conn_put(cp); +} + +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, + const size_t buflen) +{ + struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + char *p; + int i; + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); + for (i=0; inr_conns; i++) { + unsigned int flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); + return; + } + s = (struct ip_vs_sync_conn_v0 *) p; + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", + pp->name, state); + continue; + } + } else { + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", + state); + } + + ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, + (const union nf_inet_addr *)&s->caddr, + s->cport, + (const union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + + /* Send timeout as Zero */ + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt); + } +} + +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, + __u32 *opt_flags, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_sync_conn_options *topt; + + topt = (struct ip_vs_sync_conn_options *)p; + + if (plen != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); + return -EINVAL; + } + if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { + IP_VS_DBG(2, "BACKUP, conn options found twice\n"); + return -EINVAL; + } + ntoh_seq(&topt->in_seq, &opt->in_seq); + ntoh_seq(&topt->out_seq, &opt->out_seq); + *opt_flags |= IPVS_OPT_F_SEQ_DATA; + return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, + __u8 **data, unsigned int maxlen, + __u32 *opt_flags, __u32 flag) +{ + if (plen > maxlen) { + IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); + return -EINVAL; + } + if (*opt_flags & flag) { + IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); + return -EINVAL; + } + *data_len = plen; + *data = p; + *opt_flags |= flag; + return 0; +} +/* + * Process a Version 1 sync. connection + */ +static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) +{ + struct ip_vs_sync_conn_options opt; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + __u32 flags; + unsigned int af, state, pe_data_len=0, pe_name_len=0; + __u8 *pe_data=NULL, *pe_name=NULL; + __u32 opt_flags=0; + int retc=0; + + s = (union ip_vs_sync_conn *) p; + + if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); + retc = 10; + goto out; +#endif + } else if (!s->v4.type) { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } else { + return -10; + } + if (p > msg_end) + return -20; + + /* Process optional params check Type & Len. */ + while (p < msg_end) { + int ptype; + int plen; + + if (p+2 > msg_end) + return -30; + ptype = *(p++); + plen = *(p++); + + if (!plen || ((p + plen) > msg_end)) + return -40; + /* Handle seq option p = param data */ + switch (ptype & ~IPVS_OPT_F_PARAM) { + case IPVS_OPT_SEQ_DATA: + if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) + return -50; + break; + + case IPVS_OPT_PE_DATA: + if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, + IP_VS_PEDATA_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_DATA)) + return -60; + break; + + case IPVS_OPT_PE_NAME: + if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, + IP_VS_PENAME_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_NAME)) + return -70; + break; + + default: + /* Param data mandatory ? */ + if (!(ptype & IPVS_OPT_F_PARAM)) { + IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", + ptype & ~IPVS_OPT_F_PARAM); + retc = 20; + goto out; + } + } + p += plen; /* Next option */ + } + + /* Get flags and Mask off unsupported */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", + s->v4.protocol); + retc = 30; + goto out; + } + if (state >= pp->num_states) { + IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", + pp->name, state); + retc = 40; + goto out; + } + } else { + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", + state); + } + if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, + pe_data_len, pe_name, pe_name_len)) { + retc = 50; + goto out; + } + /* If only IPv4, just silent skip IPv6 */ + if (af == AF_INET) + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, + (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, + ntohl(s->v4.timeout), ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#ifdef CONFIG_IP_VS_IPV6 + else + ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, + (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, + ntohl(s->v6.timeout), ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#endif + ip_vs_pe_put(param.pe); + return 0; + /* Error exit */ +out: + IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); + return retc; + +} +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + * Handles Version 0 & 1 + */ +static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, + const size_t buflen) +{ + struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; + __u8 *p, *msg_end; + int i, nr_conns; + + if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { + IP_VS_DBG(2, "BACKUP, message header too short\n"); + return; + } + + if (buflen != ntohs(m2->size)) { + IP_VS_DBG(2, "BACKUP, bogus message size\n"); + return; + } + /* SyncID sanity check */ + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { + IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); + return; + } + /* Handle version 1 message */ + if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) + && (m2->spare == 0)) { + + msg_end = buffer + sizeof(struct ip_vs_sync_mesg); + nr_conns = m2->nr_conns; + + for (i=0; iv4) > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n"); + return; + } + s = (union ip_vs_sync_conn *)p; + size = ntohs(s->v4.ver_size) & SVER_MASK; + msg_end = p + size; + /* Basic sanity checks */ + if (msg_end > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); + return; + } + if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", + ntohs(s->v4.ver_size) >> SVER_SHIFT); + return; + } + /* Process a single sync_conn */ + retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); + if (retc < 0) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", + retc); + return; + } + /* Make sure we have 32 bit alignment */ + msg_end = p + ((size + 3) & ~3); + } + } else { + /* Old type of message */ + ip_vs_process_message_v0(ipvs, buffer, buflen); + return; + } +} + + +/* + * Setup sndbuf (mode=1) or rcvbuf (mode=0) + */ +static void set_sock_size(struct sock *sk, int mode, int val) +{ + /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ + /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ + lock_sock(sk); + if (mode) { + val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, + READ_ONCE(sysctl_wmem_max)); + sk->sk_sndbuf = val * 2; + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } else { + val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, + READ_ONCE(sysctl_rmem_max)); + sk->sk_rcvbuf = val * 2; + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + release_sock(sk); +} + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 1 : 0; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_LOOP */ + np->mc_loop = loop ? 1 : 0; + } +#endif + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_HOPS */ + np->mcast_hops = ttl; + } +#endif + release_sock(sk); +} + +/* Control fragmentation of messages */ +static void set_mcast_pmtudisc(struct sock *sk, int val) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ + lock_sock(sk); + inet->pmtudisc = val; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MTU_DISCOVER */ + np->pmtudisc = val; + } +#endif + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, struct net_device *dev) +{ + struct inet_sock *inet = inet_sk(sk); + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_IF */ + np->mcast_oif = dev->ifindex; + } +#endif + release_sock(sk); + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev) +{ + struct ip_mreqn mreq; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, + struct net_device *dev) +{ + int ret; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); + release_sock(sk); + + return ret; +} +#endif + +static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) +{ + __be32 addr; + struct sockaddr_in sin; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + pr_err("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %pI4\n", + dev->name, &addr); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); +} + +static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, + struct ipvs_sync_daemon_cfg *c, int id) +{ + if (AF_INET6 == c->mcast_af) { + sa->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_port = htons(c->mcast_port + id), + }; + sa->in6.sin6_addr = c->mcast_group.in6; + *salen = sizeof(sa->in6); + } else { + sa->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(c->mcast_port + id), + }; + sa->in.sin_addr = c->mcast_group.in; + *salen = sizeof(sa->in); + } +} + +/* + * Set up sending multicast socket over UDP + */ +static int make_send_sock(struct netns_ipvs *ipvs, int id, + struct net_device *dev, struct socket **sock_ret) +{ + /* multicast addr */ + union ipvs_sockaddr mcast_addr; + struct socket *sock; + int result, salen; + + /* First create a socket */ + result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + goto error; + } + *sock_ret = sock; + result = set_mcast_if(sock->sk, dev); + if (result < 0) { + pr_err("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); + /* Allow fragmentation if MTU changes */ + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 1, result); + + if (AF_INET == ipvs->mcfg.mcast_af) + result = bind_mcastif_addr(sock, dev); + else + result = 0; + if (result < 0) { + pr_err("Error binding address of the mcast interface\n"); + goto error; + } + + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); + result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + salen, 0); + if (result < 0) { + pr_err("Error connecting to the multicast addr\n"); + goto error; + } + + return 0; + +error: + return result; +} + + +/* + * Set up receiving multicast socket over UDP + */ +static int make_receive_sock(struct netns_ipvs *ipvs, int id, + struct net_device *dev, struct socket **sock_ret) +{ + /* multicast addr */ + union ipvs_sockaddr mcast_addr; + struct socket *sock; + int result, salen; + + /* First create a socket */ + result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + goto error; + } + *sock_ret = sock; + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = SK_CAN_REUSE; + result = sysctl_sync_sock_size(ipvs); + if (result > 0) + set_sock_size(sock->sk, 0, result); + + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + sock->sk->sk_bound_dev_if = dev->ifindex; + result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); + if (result < 0) { + pr_err("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ +#ifdef CONFIG_IP_VS_IPV6 + if (ipvs->bcfg.mcast_af == AF_INET6) + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, + dev); + else +#endif + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, + dev); + if (result < 0) { + pr_err("Error joining to the multicast group\n"); + goto error; + } + + return 0; + +error: + return result; +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static int +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + int ret; + + msize = ntohs(msg->size); + + ret = ip_vs_send_async(sock, (char *)msg, msize); + if (ret >= 0 || ret == -EAGAIN) + return ret; + pr_err("ip_vs_send_async error %d\n", ret); + return 0; +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov = {buffer, buflen}; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen); + len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); + if (len < 0) + return len; + + LeaveFunction(7); + return len; +} + +/* Wakeup the master thread for sending */ +static void master_wakeup_work_handler(struct work_struct *work) +{ + struct ipvs_master_sync_state *ms = + container_of(work, struct ipvs_master_sync_state, + master_wakeup_work.work); + struct netns_ipvs *ipvs = ms->ipvs; + + spin_lock_bh(&ipvs->sync_lock); + if (ms->sync_queue_len && + ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + int id = (int)(ms - ipvs->ms); + + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; + wake_up_process(ipvs->master_tinfo[id].task); + } + spin_unlock_bh(&ipvs->sync_lock); +} + +/* Get next buffer to send */ +static inline struct ip_vs_sync_buff * +next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) +{ + struct ip_vs_sync_buff *sb; + + sb = sb_dequeue(ipvs, ms); + if (sb) + return sb; + /* Do not delay entries in buffer for more than 2 seconds */ + return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); +} + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = tinfo->ipvs; + struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; + struct sock *sk = tinfo->sock->sk; + struct ip_vs_sync_buff *sb; + + pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d, id = %d\n", + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); + + for (;;) { + sb = next_sync_buff(ipvs, ms); + if (unlikely(kthread_should_stop())) + break; + if (!sb) { + schedule_timeout(IPVS_SYNC_CHECK_PERIOD); + continue; + } + while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { + /* (Ab)use interruptible sleep to avoid increasing + * the load avg. + */ + __wait_event_interruptible(*sk_sleep(sk), + sock_writeable(sk) || + kthread_should_stop()); + if (unlikely(kthread_should_stop())) + goto done; + } + ip_vs_sync_buff_release(sb); + } + +done: + __set_current_state(TASK_RUNNING); + if (sb) + ip_vs_sync_buff_release(sb); + + /* clean up the sync_buff queue */ + while ((sb = sb_dequeue(ipvs, ms))) + ip_vs_sync_buff_release(sb); + __set_current_state(TASK_RUNNING); + + /* clean up the current sync_buff */ + sb = get_curr_sync_buff(ipvs, ms, 0); + if (sb) + ip_vs_sync_buff_release(sb); + + return 0; +} + + +static int sync_thread_backup(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = tinfo->ipvs; + struct sock *sk = tinfo->sock->sk; + struct udp_sock *up = udp_sk(sk); + int len; + + pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d, id = %d\n", + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); + + while (!kthread_should_stop()) { + wait_event_interruptible(*sk_sleep(sk), + !skb_queue_empty_lockless(&sk->sk_receive_queue) || + !skb_queue_empty_lockless(&up->reader_queue) || + kthread_should_stop()); + + /* do we have data now? */ + while (!skb_queue_empty_lockless(&sk->sk_receive_queue) || + !skb_queue_empty_lockless(&up->reader_queue)) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + ipvs->bcfg.sync_maxlen); + if (len <= 0) { + if (len != -EAGAIN) + pr_err("receiving message error\n"); + break; + } + + ip_vs_process_message(ipvs, tinfo->buf, len); + } + } + + return 0; +} + + +int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, + int state) +{ + struct ip_vs_sync_thread_data *ti = NULL, *tinfo; + struct task_struct *task; + struct net_device *dev; + char *name; + int (*threadfn)(void *data); + int id = 0, count, hlen; + int result = -ENOMEM; + u16 mtu, min_mtu; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", + sizeof(struct ip_vs_sync_conn_v0)); + + /* increase the module use count */ + if (!ip_vs_use_count_inc()) + return -ENOPROTOOPT; + + /* Do not hold one mutex and then to block on another */ + for (;;) { + rtnl_lock(); + if (mutex_trylock(&ipvs->sync_mutex)) + break; + rtnl_unlock(); + mutex_lock(&ipvs->sync_mutex); + if (rtnl_trylock()) + break; + mutex_unlock(&ipvs->sync_mutex); + } + + if (!ipvs->sync_state) { + count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); + ipvs->threads_mask = count - 1; + } else + count = ipvs->threads_mask + 1; + + if (c->mcast_af == AF_UNSPEC) { + c->mcast_af = AF_INET; + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); + } + if (!c->mcast_port) + c->mcast_port = IP_VS_SYNC_PORT; + if (!c->mcast_ttl) + c->mcast_ttl = 1; + + dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); + if (!dev) { + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); + result = -ENODEV; + goto out_early; + } + hlen = (AF_INET6 == c->mcast_af) ? + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : + sizeof(struct iphdr) + sizeof(struct udphdr); + mtu = (state == IP_VS_STATE_BACKUP) ? + clamp(dev->mtu, 1500U, 65535U) : 1500U; + min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; + + if (c->sync_maxlen) + c->sync_maxlen = clamp_t(unsigned int, + c->sync_maxlen, min_mtu, + 65535 - hlen); + else + c->sync_maxlen = mtu - hlen; + + if (state == IP_VS_STATE_MASTER) { + result = -EEXIST; + if (ipvs->ms) + goto out_early; + + ipvs->mcfg = *c; + name = "ipvs-m:%d:%d"; + threadfn = sync_thread_master; + } else if (state == IP_VS_STATE_BACKUP) { + result = -EEXIST; + if (ipvs->backup_tinfo) + goto out_early; + + ipvs->bcfg = *c; + name = "ipvs-b:%d:%d"; + threadfn = sync_thread_backup; + } else { + result = -EINVAL; + goto out_early; + } + + if (state == IP_VS_STATE_MASTER) { + struct ipvs_master_sync_state *ms; + + result = -ENOMEM; + ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); + if (!ipvs->ms) + goto out; + ms = ipvs->ms; + for (id = 0; id < count; id++, ms++) { + INIT_LIST_HEAD(&ms->sync_queue); + ms->sync_queue_len = 0; + ms->sync_queue_delay = 0; + INIT_DELAYED_WORK(&ms->master_wakeup_work, + master_wakeup_work_handler); + ms->ipvs = ipvs; + } + } + result = -ENOMEM; + ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), + GFP_KERNEL); + if (!ti) + goto out; + + for (id = 0; id < count; id++) { + tinfo = &ti[id]; + tinfo->ipvs = ipvs; + if (state == IP_VS_STATE_BACKUP) { + result = -ENOMEM; + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, + GFP_KERNEL); + if (!tinfo->buf) + goto out; + } + tinfo->id = id; + if (state == IP_VS_STATE_MASTER) + result = make_send_sock(ipvs, id, dev, &tinfo->sock); + else + result = make_receive_sock(ipvs, id, dev, &tinfo->sock); + if (result < 0) + goto out; + + task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto out; + } + tinfo->task = task; + } + + /* mark as active */ + + if (state == IP_VS_STATE_MASTER) + ipvs->master_tinfo = ti; + else + ipvs->backup_tinfo = ti; + spin_lock_bh(&ipvs->sync_buff_lock); + ipvs->sync_state |= state; + spin_unlock_bh(&ipvs->sync_buff_lock); + + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + + return 0; + +out: + /* We do not need RTNL lock anymore, release it here so that + * sock_release below can use rtnl_lock to leave the mcast group. + */ + rtnl_unlock(); + id = min(id, count - 1); + if (ti) { + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->task) + kthread_stop(tinfo->task); + } + } + if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { + kfree(ipvs->ms); + ipvs->ms = NULL; + } + mutex_unlock(&ipvs->sync_mutex); + + /* No more mutexes, release socks */ + if (ti) { + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + } + kfree(ti); + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + return result; + +out_early: + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + return result; +} + + +int stop_sync_thread(struct netns_ipvs *ipvs, int state) +{ + struct ip_vs_sync_thread_data *ti, *tinfo; + int id; + int retc = -EINVAL; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + + mutex_lock(&ipvs->sync_mutex); + if (state == IP_VS_STATE_MASTER) { + retc = -ESRCH; + if (!ipvs->ms) + goto err; + ti = ipvs->master_tinfo; + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ipvs->sync_buff_lock); + spin_lock(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock(&ipvs->sync_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); + + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + struct ipvs_master_sync_state *ms = &ipvs->ms[id]; + int ret; + + tinfo = &ti[id]; + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(tinfo->task)); + cancel_delayed_work_sync(&ms->master_wakeup_work); + ret = kthread_stop(tinfo->task); + if (retc >= 0) + retc = ret; + } + kfree(ipvs->ms); + ipvs->ms = NULL; + ipvs->master_tinfo = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + retc = -ESRCH; + if (!ipvs->backup_tinfo) + goto err; + ti = ipvs->backup_tinfo; + + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + retc = 0; + for (id = ipvs->threads_mask; id >= 0; id--) { + int ret; + + tinfo = &ti[id]; + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(tinfo->task)); + ret = kthread_stop(tinfo->task); + if (retc >= 0) + retc = ret; + } + ipvs->backup_tinfo = NULL; + } else { + goto err; + } + id = ipvs->threads_mask; + mutex_unlock(&ipvs->sync_mutex); + + /* No more mutexes, release socks */ + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + } + kfree(ti); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + return retc; + +err: + mutex_unlock(&ipvs->sync_mutex); + return retc; +} + +/* + * Initialize data struct for each netns + */ +int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) +{ + __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); + return 0; +} + +void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) +{ + int retc; + + retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); +} diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c new file mode 100644 index 000000000..f2579fc9c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* IPVS: Power of Twos Choice Scheduling module + * + * Authors: Darby Payne + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include + +#include + +/* Power of Twos Choice scheduling, algorithm originally described by + * Michael Mitzenmacher. + * + * Randomly picks two destinations and picks the one with the least + * amount of connections + * + * The algorithm calculates a few variables + * - total_weight = sum of all weights + * - rweight1 = random number between [0,total_weight] + * - rweight2 = random number between [0,total_weight] + * + * For each destination + * decrement rweight1 and rweight2 by the destination weight + * pick choice1 when rweight1 is <= 0 + * pick choice2 when rweight2 is <= 0 + * + * Return choice2 if choice2 has less connections than choice 1 normalized + * by weight + * + * References + * ---------- + * + * [Mitzenmacher 2016] + * The Power of Two Random Choices: A Survey of Techniques and Results + * Michael Mitzenmacher, Andrea W. Richa y, Ramesh Sitaraman + * http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf + * + */ +static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc, + const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL; + int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0; + int overhead2, total_weight = 0, weight; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* Generate a random weight between [0,sum of all weights) */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + total_weight += weight; + choice1 = dest; + } + } + } + + if (!choice1) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* Add 1 to total_weight so that the random weights are inclusive + * from 0 to total_weight + */ + total_weight += 1; + rweight1 = prandom_u32_max(total_weight); + rweight2 = prandom_u32_max(total_weight); + + /* Pick two weighted servers */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + weight = atomic_read(&dest->weight); + if (weight <= 0) + continue; + + rweight1 -= weight; + rweight2 -= weight; + + if (rweight1 <= 0 && weight1 == -1) { + choice1 = dest; + weight1 = weight; + overhead1 = ip_vs_dest_conn_overhead(dest); + } + + if (rweight2 <= 0 && weight2 == -1) { + choice2 = dest; + weight2 = weight; + overhead2 = ip_vs_dest_conn_overhead(dest); + } + + if (weight1 != -1 && weight2 != -1) + goto nextstage; + } + +nextstage: + if (choice2 && (weight2 * overhead1) > (weight1 * overhead2)) + choice1 = choice2; + + IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(choice1->af, &choice1->addr), + ntohs(choice1->port), atomic_read(&choice1->activeconns), + refcount_read(&choice1->refcnt), + atomic_read(&choice1->weight)); + + return choice1; +} + +static struct ip_vs_scheduler ip_vs_twos_scheduler = { + .name = "twos", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list), + .schedule = ip_vs_twos_schedule, +}; + +static int __init ip_vs_twos_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_twos_scheduler); +} + +static void __exit ip_vs_twos_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_twos_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_twos_init); +module_exit(ip_vs_twos_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 000000000..09f584b56 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_dest_conn_overhead(dest); + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + refcount_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c new file mode 100644 index 000000000..1bc7a0789 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include + +/* The WRR algorithm depends on some caclulations: + * - mw: maximum weight + * - di: weight step, greatest common divisor from all weights + * - cw: current required weight + * As result, all weights are in the [di..mw] range with a step=di. + * + * First, we start with cw = mw and select dests with weight >= cw. + * Then cw is reduced with di and all dests are checked again. + * Last pass should be with cw = di. We have mw/di passes in total: + * + * pass 1: cw = max weight + * pass 2: cw = max weight - di + * pass 3: cw = max weight - 2 * di + * ... + * last pass: cw = di + * + * Weights are supposed to be >= di but we run in parallel with + * weight changes, it is possible some dest weight to be reduced + * below di, bad if it is the only available dest. + * + * So, we modify how mw is calculated, now it is reduced with (di - 1), + * so that last cw is 1 to catch such dests with weight below di: + * pass 1: cw = max weight - (di - 1) + * pass 2: cw = max weight - di - (di - 1) + * pass 3: cw = max weight - 2 * di - (di - 1) + * ... + * last pass: cw = 1 + * + */ + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct ip_vs_dest *cl; /* current dest or head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ + struct rcu_head rcu_head; +}; + + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->weight); + if (new_weight > weight) + weight = new_weight; + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); + if (mark == NULL) + return -ENOMEM; + + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); + mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + mark->cw = mark->mw; + svc->sched_data = mark; + + return 0; +} + + +static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + /* + * Release the mark variable + */ + kfree_rcu(mark, rcu_head); +} + + +static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + spin_lock_bh(&svc->sched_lock); + mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); + mark->di = ip_vs_wrr_gcd_weight(svc); + mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); + if (mark->cw > mark->mw || !mark->cw) + mark->cw = mark->mw; + else if (mark->di > 1) + mark->cw = (mark->cw / mark->di) * mark->di + 1; + spin_unlock_bh(&svc->sched_lock); + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *last, *stop = NULL; + struct ip_vs_wrr_mark *mark = svc->sched_data; + bool last_pass = false, restarted = false; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + spin_lock_bh(&svc->sched_lock); + dest = mark->cl; + /* No available dests? */ + if (mark->mw == 0) + goto err_noavail; + last = dest; + /* Stop only after all dests were checked for weight >= 1 (last pass) */ + while (1) { + list_for_each_entry_continue_rcu(dest, + &svc->destinations, + n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) + goto found; + if (dest == stop) + goto err_over; + } + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* Stop if we tried last pass from first dest: + * 1. last_pass: we started checks when cw > di but + * then all dests were checked for w >= 1 + * 2. last was head: the first and only traversal + * was for weight >= 1, for all dests. + */ + if (last_pass || + &last->n_list == &svc->destinations) + goto err_over; + restarted = true; + } + last_pass = mark->cw <= mark->di; + if (last_pass && restarted && + &last->n_list != &svc->destinations) { + /* First traversal was for w >= 1 but only + * for dests after 'last', now do the same + * for all dests up to 'last'. + */ + stop = last; + } + } + +found: + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + refcount_read(&dest->refcnt), + atomic_read(&dest->weight)); + mark->cl = dest; + + out: + spin_unlock_bh(&svc->sched_lock); + return dest; + +err_noavail: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available"); + goto out; + +err_over: + mark->cl = dest; + dest = NULL; + ip_vs_scheduler_err(svc, "no destination available: " + "all destinations are overloaded"); + goto out; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .add_dest = ip_vs_wrr_dest_changed, + .del_dest = ip_vs_wrr_dest_changed, + .upd_dest = ip_vs_wrr_dest_changed, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 000000000..d40a4ca2b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1686 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * Changes: + * + * Description of forwarding methods: + * - all transmitters are called from LOCAL_IN (remote clients) and + * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD + * - not all connections have destination server, for example, + * connections in backup server when fwmark is used + * - bypass connections use daddr from packet + * - we can use dst without ref while sending in RCU section, we use + * ref when returning NF_ACCEPT for NAT-ed packet via loopback + * LOCAL_OUT rules: + * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) + * - skb->pkt_type is not set yet + * - the only place where we can see skb->sk != NULL + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include /* for tcphdr */ +#include +#include +#include +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +enum { + IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ + IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ + IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to + * local + */ + IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ + IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ + IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ +}; + +static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) +{ + return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); +} + +static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst) +{ + kfree(dest_dst); +} + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, + struct dst_entry *dst, u32 dst_cookie) +{ + struct ip_vs_dest_dst *old; + + old = rcu_dereference_protected(dest->dest_dst, + lockdep_is_held(&dest->dst_lock)); + + if (dest_dst) { + dest_dst->dst_cache = dst; + dest_dst->dst_cookie = dst_cookie; + } + rcu_assign_pointer(dest->dest_dst, dest_dst); + + if (old) + call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); +} + +static inline struct ip_vs_dest_dst * +__ip_vs_dst_check(struct ip_vs_dest *dest) +{ + struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); + struct dst_entry *dst; + + if (!dest_dst) + return NULL; + dst = dest_dst->dst_cache; + if (dst->obsolete && + dst->ops->check(dst, dest_dst->dst_cookie) == NULL) + return NULL; + return dest_dst; +} + +static inline bool +__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) +{ + if (IP6CB(skb)->frag_max_size) { + /* frag_max_size tell us that, this packet have been + * defragmented by netfilter IPv6 conntrack module. + */ + if (IP6CB(skb)->frag_max_size > mtu) + return true; /* largest fragment violate MTU */ + } + else if (skb->len > mtu && !skb_is_gso(skb)) { + return true; /* Packet size violate MTU size */ + } + return false; +} + +/* Get route to daddr, update *saddr, optionally bind route to saddr */ +static struct rtable *do_output_route4(struct net *net, __be32 daddr, + int rt_mode, __be32 *saddr) +{ + struct flowi4 fl4; + struct rtable *rt; + bool loop = false; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; + +retry: + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + /* Invalid saddr ? */ + if (PTR_ERR(rt) == -EINVAL && *saddr && + rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { + *saddr = 0; + flowi4_update_output(&fl4, 0, 0, daddr, 0); + goto retry; + } + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); + return NULL; + } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + ip_rt_put(rt); + *saddr = fl4.saddr; + flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + loop = true; + goto retry; + } + *saddr = fl4.saddr; + return rt; +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} +#endif + +static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, + int rt_mode, + bool new_rt_is_local) +{ + bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL); + bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); + bool source_is_loopback; + bool old_rt_is_local; + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); + + source_is_loopback = + (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (addr_type & IPV6_ADDR_LOOPBACK); + old_rt_is_local = __ip_vs_is_local_route6( + (struct rt6_info *)skb_dst(skb)); + } else +#endif + { + source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr); + old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; + } + + if (unlikely(new_rt_is_local)) { + if (!rt_mode_allow_local) + return true; + if (!rt_mode_allow_redirect && !old_rt_is_local) + return true; + } else { + if (!rt_mode_allow_non_local) + return true; + if (source_is_loopback) + return true; + } + return false; +} + +static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) +{ + struct sock *sk = skb->sk; + struct rtable *ort = skb_rtable(skb); + + if (!skb->dev && sk && sk_fullsock(sk)) + ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); +} + +static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, + int rt_mode, + struct ip_vs_iphdr *ipvsh, + struct sk_buff *skb, int mtu) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + struct net *net = ipvs->net; + + if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { + if (!skb->dev) + skb->dev = net->loopback_dev; + /* only send ICMP too big on first fragment */ + if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG(1, "frag needed for %pI6c\n", + &ipv6_hdr(skb)->saddr); + return false; + } + } else +#endif + { + /* If we're going to tunnel the packet and pmtu discovery + * is disabled, we'll just fragment it anyway + */ + if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) + return true; + + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && + skb->len > mtu && !skb_is_gso(skb) && + !ip_vs_iph_icmp(ipvsh))) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + IP_VS_DBG(1, "frag needed for %pI4\n", + &ip_hdr(skb)->saddr); + return false; + } + } + + return true; +} + +static inline bool decrement_ttl(struct netns_ipvs *ipvs, + int skb_af, + struct sk_buff *skb) +{ + struct net *net = ipvs->net; + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + struct dst_entry *dst = skb_dst(skb); + + /* check and decrement ttl */ + if (ipv6_hdr(skb)->hop_limit <= 1) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); + + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, + ICMPV6_EXC_HOPLIMIT, 0); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + + return false; + } + + /* don't propagate ttl change to cloned packets */ + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) + return false; + + ipv6_hdr(skb)->hop_limit--; + } else +#endif + { + if (ip_hdr(skb)->ttl <= 1) { + /* Tell the sender its packet died... */ + IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); + return false; + } + + /* don't propagate ttl change to cloned packets */ + if (skb_ensure_writable(skb, sizeof(struct iphdr))) + return false; + + /* Decrease ttl */ + ip_decrease_ttl(ip_hdr(skb)); + } + + return true; +} + +/* Get route to destination or remote server */ +static int +__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, + __be32 daddr, int rt_mode, __be32 *ret_saddr, + struct ip_vs_iphdr *ipvsh) +{ + struct net *net = ipvs->net; + struct ip_vs_dest_dst *dest_dst; + struct rtable *rt; /* Route to the other host */ + int mtu; + int local, noref = 1; + + if (dest) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rtable *) dest_dst->dst_cache; + else { + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + rt = do_output_route4(net, dest->addr.ip, rt_mode, + &dest_dst->dst_saddr.ip); + if (!rt) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; + } + __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; + } else { + __be32 saddr = htonl(INADDR_ANY); + + noref = 0; + + /* For such unconfigured boxes avoid many route lookups + * for performance reasons because we do not remember saddr + */ + rt_mode &= ~IP_VS_RT_MODE_CONNECT; + rt = do_output_route4(net, daddr, rt_mode, &saddr); + if (!rt) + goto err_unreach; + if (ret_saddr) + *ret_saddr = saddr; + } + + local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI4\n", &daddr); + goto err_put; + } + + if (unlikely(local)) { + /* skb to local stack, preserve old route */ + if (!noref) + ip_rt_put(rt); + return local; + } + + if (!decrement_ttl(ipvs, skb_af, skb)) + goto err_put; + + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { + mtu = dst_mtu(&rt->dst); + } else { + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); + } + if (mtu < 68) { + IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); + goto err_put; + } + maybe_update_pmtu(skb_af, skb, mtu); + } + + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) + goto err_put; + + skb_dst_drop(skb); + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + ip_rt_put(rt); + return -1; + +err_unreach: + dst_link_failure(skb); + return -1; +} + +#ifdef CONFIG_IP_VS_IPV6 +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) +{ + struct dst_entry *dst; + struct flowi6 fl6 = { + .daddr = *daddr, + }; + + if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6.daddr, 0, &fl6.saddr) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + dst = NULL; + goto out_err; + } + } + *ret_saddr = fl6.saddr; + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + +/* + * Get route to destination or remote server + */ +static int +__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) +{ + struct net *net = ipvs->net; + struct ip_vs_dest_dst *dest_dst; + struct rt6_info *rt; /* Route to the other host */ + struct dst_entry *dst; + int mtu; + int local, noref = 1; + + if (dest) { + dest_dst = __ip_vs_dst_check(dest); + if (likely(dest_dst)) + rt = (struct rt6_info *) dest_dst->dst_cache; + else { + u32 cookie; + + dest_dst = ip_vs_dest_dst_alloc(); + spin_lock_bh(&dest->dst_lock); + if (!dest_dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + goto err_unreach; + } + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest_dst->dst_saddr.in6, + do_xfrm, rt_mode); + if (!dst) { + __ip_vs_dst_set(dest, NULL, NULL, 0); + spin_unlock_bh(&dest->dst_lock); + ip_vs_dest_dst_free(dest_dst); + goto err_unreach; + } + rt = (struct rt6_info *) dst; + cookie = rt6_get_cookie(rt); + __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest_dst->dst_saddr.in6, + atomic_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; + } else { + noref = 0; + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, + rt_mode); + if (!dst) + goto err_unreach; + rt = (struct rt6_info *) dst; + } + + local = __ip_vs_is_local_route6(rt); + + if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, + local))) { + IP_VS_DBG_RL("We are crossing local and non-local addresses" + " daddr=%pI6\n", daddr); + goto err_put; + } + + if (unlikely(local)) { + /* skb to local stack, preserve old route */ + if (!noref) + dst_release(&rt->dst); + return local; + } + + if (!decrement_ttl(ipvs, skb_af, skb)) + goto err_put; + + /* MTU checking */ + if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) + mtu = dst_mtu(&rt->dst); + else { + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); + } + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto err_put; + } + maybe_update_pmtu(skb_af, skb, mtu); + } + + if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) + goto err_put; + + skb_dst_drop(skb); + if (noref) + skb_dst_set_noref(skb, &rt->dst); + else + skb_dst_set(skb, &rt->dst); + + return local; + +err_put: + if (!noref) + dst_release(&rt->dst); + return -1; + +err_unreach: + /* The ip6_link_failure function requires the dev field to be set + * in order to get the net (further for the sake of fwmark + * reflection). + */ + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + + dst_link_failure(skb); + return -1; +} +#endif + + +/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ +static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, + struct ip_vs_conn *cp) +{ + int ret = NF_ACCEPT; + + skb->ipvs_property = 1; + if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) + ret = ip_vs_confirm_conntrack(skb); + if (ret == NF_ACCEPT) { + nf_reset_ct(skb); + skb_forward_csum(skb); + if (skb->dev) + skb_clear_tstamp(skb); + } + return ret; +} + +/* In the event of a remote destination, it's possible that we would have + * matches against an old socket (particularly a TIME-WAIT socket). This + * causes havoc down the line (ip_local_out et. al. expect regular sockets + * and invalid memory accesses will happen) so simply drop the association + * in this case. +*/ +static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) +{ + /* If dev is set, the packet came from the LOCAL_IN callback and + * not from a local TCP socket. + */ + if (skb->dev) + skb_orphan(skb); +} + +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 1); + + /* Remove the early_demux association unless it's bound for the + * exact same port and address on this host after translation. + */ + if (!local || cp->vport != cp->dport || + !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) + ip_vs_drop_early_demux_sk(skb); + + if (!local) { + skb_forward_csum(skb); + if (skb->dev) + skb_clear_tstamp(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); + } else + ret = NF_ACCEPT; + + return ret; +} + +/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ +static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, + struct ip_vs_conn *cp, int local) +{ + int ret = NF_STOLEN; + + skb->ipvs_property = 1; + if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) + ip_vs_notrack(skb); + if (!local) { + ip_vs_drop_early_demux_sk(skb); + skb_forward_csum(skb); + if (skb->dev) + skb_clear_tstamp(skb); + NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, + NULL, skb_dst(skb)->dev, dst_output); + } else + ret = NF_ACCEPT; + return ret; +} + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + /* we do not touch skb and do not need pskb ptr */ + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct iphdr *iph = ip_hdr(skb); + + EnterFunction(10); + + if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, + IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) + goto tx_error; + + ip_send_check(iph); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + EnterFunction(10); + + if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL, + &iph->daddr, NULL, + ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0) + goto tx_error; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct rtable *rt; /* Route to the other host */ + int local, rc, was_input; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + was_input = rt_is_input_route(skb_rtable(skb)); + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR, NULL, ipvsh); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct) { + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, + "ip_vs_nat_xmit(): " + "stopping DNAT to local address"); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, + "ip_vs_nat_xmit(): stopping DNAT to loopback " + "address"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (skb_ensure_writable(skb, sizeof(struct iphdr))) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) + goto tx_error; + ip_hdr(skb)->daddr = cp->daddr.ip; + ip_send_check(ip_hdr(skb)); + + IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + + LeaveFunction(10); + return rc; + + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct rt6_info *rt; /* Route to the other host */ + int local, rc; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, + NULL, ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct) { + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + + LeaveFunction(10); + return rc; + +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +} +#endif + +/* When forwarding a packet, we must ensure that we've got enough headroom + * for the encapsulation packet in the skb. This also gives us an + * opportunity to figure out what the payload_len, dsfield, ttl, and df + * values should be, so that we won't need to look at the old ip header + * again + */ +static struct sk_buff * +ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, + unsigned int max_headroom, __u8 *next_protocol, + __u32 *payload_len, __u8 *dsfield, __u8 *ttl, + __be16 *df) +{ + struct sk_buff *new_skb = NULL; + struct iphdr *old_iph = NULL; + __u8 old_dsfield; +#ifdef CONFIG_IP_VS_IPV6 + struct ipv6hdr *old_ipv6h = NULL; +#endif + + ip_vs_drop_early_demux_sk(skb); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { + new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) + goto error; + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + consume_skb(skb); + skb = new_skb; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (skb_af == AF_INET6) { + old_ipv6h = ipv6_hdr(skb); + *next_protocol = IPPROTO_IPV6; + if (payload_len) + *payload_len = + ntohs(old_ipv6h->payload_len) + + sizeof(*old_ipv6h); + old_dsfield = ipv6_get_dsfield(old_ipv6h); + *ttl = old_ipv6h->hop_limit; + if (df) + *df = 0; + } else +#endif + { + old_iph = ip_hdr(skb); + /* Copy DF, reset fragment offset and MF */ + if (df) + *df = (old_iph->frag_off & htons(IP_DF)); + *next_protocol = IPPROTO_IPIP; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + old_dsfield = ipv4_get_dsfield(old_iph); + *ttl = old_iph->ttl; + if (payload_len) + *payload_len = skb_ip_totlen(skb); + } + + /* Implement full-functionality option for ECN encapsulation */ + *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(-ENOMEM); +} + +static inline int __tun_gso_type_mask(int encaps_af, int orig_af) +{ + switch (encaps_af) { + case AF_INET: + return SKB_GSO_IPXIP4; + case AF_INET6: + return SKB_GSO_IPXIP6; + default: + return 0; + } +} + +static int +ipvs_gue_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 dport; + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); + struct udphdr *udph; /* Our new UDP header */ + struct guehdr *gueh; /* Our new GUE header */ + size_t hdrlen, optlen = 0; + void *data; + bool need_priv = false; + + if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + need_priv = true; + } + + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); + + gueh = (struct guehdr *)skb->data; + + gueh->control = 0; + gueh->version = 0; + gueh->hlen = optlen >> 2; + gueh->flags = 0; + gueh->proto_ctype = *next_protocol; + + data = &gueh[1]; + + if (need_priv) { + __be32 *flags = data; + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd; + + gueh->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd = data; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + udph = udp_hdr(skb); + + dport = cp->dest->tun_port; + udph->dest = dport; + udph->source = sport; + udph->len = htons(skb->len); + udph->check = 0; + + *next_protocol = IPPROTO_UDP; + + return 0; +} + +static void +ipvs_gre_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 proto = *next_protocol == IPPROTO_IPIP ? + htons(ETH_P_IP) : htons(ETH_P_IPV6); + __be16 tflags = 0; + size_t hdrlen; + + if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + + hdrlen = gre_calc_hlen(tflags); + gre_build_header(skb, hdrlen, tflags, proto, 0, 0); + + *next_protocol = IPPROTO_GRE; +} + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. + * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; + struct rtable *rt; /* Route to the other host */ + __be32 saddr; /* Source for tunnel */ + struct net_device *tdev; /* Device to other host */ + __u8 next_protocol = 0; + __u8 dsfield = 0; + __u8 ttl = 0; + __be16 df = 0; + __be16 *dfp = NULL; + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int ret, local; + int tun_type, gso_type; + int tun_flags; + + EnterFunction(10); + + local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_CONNECT | + IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh); + if (local < 0) + goto tx_error; + if (local) + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); + + rt = skb_rtable(skb); + tdev = rt->dst.dev; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; + } + + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ + dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, NULL, &dsfield, + &ttl, dfp); + if (IS_ERR(skb)) + goto tx_error; + + gso_type = __tun_gso_type_mask(AF_INET, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; + } + + if (iptunnel_handle_offloads(skb, gso_type)) + goto tx_error; + + skb->transport_header = skb->network_header; + + skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* + * Push down and install the IPIP header. + */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = next_protocol; + iph->tos = dsfield; + iph->daddr = cp->daddr.ip; + iph->saddr = saddr; + iph->ttl = ttl; + ip_select_ident(net, skb, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ret = ip_vs_tunnel_xmit_prepare(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(net, skb->sk, skb); + else if (ret == NF_DROP) + kfree_skb(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error: + if (!IS_ERR(skb)) + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; + struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ + struct net_device *tdev; /* Device to other host */ + __u8 next_protocol = 0; + __u32 payload_len = 0; + __u8 dsfield = 0; + __u8 ttl = 0; + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int ret, local; + int tun_type, gso_type; + int tun_flags; + + EnterFunction(10); + + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, + &saddr, ipvsh, 1, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_TUNNEL); + if (local < 0) + goto tx_error; + if (local) + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); + + rt = (struct rt6_info *) skb_dst(skb); + tdev = rt->dst.dev; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; + } + + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, + &next_protocol, &payload_len, + &dsfield, &ttl, NULL); + if (IS_ERR(skb)) + goto tx_error; + + gso_type = __tun_gso_type_mask(AF_INET6, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; + } + + if (iptunnel_handle_offloads(skb, gso_type)) + goto tx_error; + + skb->transport_header = skb->network_header; + + skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* + * Push down and install the IPIP header. + */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = next_protocol; + iph->payload_len = htons(payload_len); + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + ipv6_change_dsfield(iph, 0, dsfield); + iph->daddr = cp->daddr.in6; + iph->saddr = saddr; + iph->hop_limit = ttl; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ret = ip_vs_tunnel_xmit_prepare(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(net, skb->sk, skb); + else if (ret == NF_DROP) + kfree_skb(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error: + if (!IS_ERR(skb)) + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + int local; + + EnterFunction(10); + + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); + if (local < 0) + goto tx_error; + if (local) + return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1); + + ip_send_check(ip_hdr(skb)); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) +{ + int local; + + EnterFunction(10); + + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, + NULL, ipvsh, 0, + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH); + if (local < 0) + goto tx_error; + if (local) + return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *iph) +{ + struct rtable *rt; /* Route to the other host */ + int rc; + int local; + int rt_mode, was_input; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp, iph); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + was_input = rt_is_input_route(skb_rtable(skb)); + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? + IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode, + NULL, iph); + if (local < 0) + goto tx_error; + rt = skb_rtable(skb); + + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (skb_ensure_writable(skb, offset)) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local); + goto out; + + tx_error: + kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset, unsigned int hooknum, + struct ip_vs_iphdr *ipvsh) +{ + struct rt6_info *rt; /* Route to the other host */ + int rc; + int local; + int rt_mode; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp, ipvsh); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + /* LOCALNODE from FORWARD hook is not supported */ + rt_mode = (hooknum != NF_INET_FORWARD) ? + IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; + local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); + if (local < 0) + goto tx_error; + rt = (struct rt6_info *) skb_dst(skb); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (skb_ensure_writable(skb, offset)) + goto tx_error; + + if (skb_cow(skb, rt->dst.dev->hard_header_len)) + goto tx_error; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->ignore_df = 1; + + rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local); + goto out; + +tx_error: + kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +} +#endif -- cgit v1.2.3